175 files changed, 6839 insertions, 5527 deletions
diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt index 4ef9f7cd5dc..944aa55e79f 100644 --- a/Documentation/networking/e100.txt +++ b/Documentation/networking/e100.txt @@ -1,16 +1,17 @@ Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== -November 17, 2004 - +November 15, 2005 Contents ======== - In This Release - Identifying Your Adapter +- Building and Installation - Driver Configuration Parameters - Additional Configurations +- Known Issues - Support @@ -18,18 +19,30 @@ In This Release =============== This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of -Adapters, version 3.3.x. This driver supports 2.4.x and 2.6.x kernels. +Adapters. This driver includes support for Itanium(R)2-based systems. + +For questions related to hardware requirements, refer to the documentation +supplied with your Intel PRO/100 adapter. + +The following features are now available in supported kernels: + - Native VLANs + - Channel Bonding (teaming) + - SNMP + +Channel Bonding documentation can be found in the Linux kernel source: +/Documentation/networking/bonding.txt + Identifying Your Adapter ======================== -For more information on how to identify your adapter, go to the Adapter & +For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: http://support.intel.com/support/network/adapter/pro100/21397.htm -For the latest Intel network drivers for Linux, refer to the following -website. In the search field, enter your adapter name or type, or use the +For the latest Intel network drivers for Linux, refer to the following +website. In the search field, enter your adapter name or type, or use the networking link on the left to search for your adapter: http://downloadfinder.intel.com/scripts-df/support_intel.asp @@ -40,73 +53,75 @@ Driver Configuration Parameters The default value for each parameter is generally the recommended setting, unless otherwise noted. -Rx Descriptors: Number of receive descriptors. A receive descriptor is a data - structure that describes a receive buffer and its attributes to the network - controller. The data in the descriptor is used by the controller to write - data from the controller to host memory. In the 3.0.x driver the valid - range for this parameter is 64-256. The default value is 64. This parameter - can be changed using the command - +Rx Descriptors: Number of receive descriptors. A receive descriptor is a data + structure that describes a receive buffer and its attributes to the network + controller. The data in the descriptor is used by the controller to write + data from the controller to host memory. In the 3.x.x driver the valid range + for this parameter is 64-256. The default value is 64. This parameter can be + changed using the command: + ethtool -G eth? rx n, where n is the number of desired rx descriptors. -Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a - data structure that describes a transmit buffer and its attributes to the - network controller. The data in the descriptor is used by the controller to - read data from the host memory to the controller. In the 3.0.x driver the - valid range for this parameter is 64-256. The default value is 64. This - parameter can be changed using the command +Tx Descriptors: Number of transmit descriptors. 
A transmit descriptor is a data + structure that describes a transmit buffer and its attributes to the network + controller. The data in the descriptor is used by the controller to read + data from the host memory to the controller. In the 3.x.x driver the valid + range for this parameter is 64-256. The default value is 64. This parameter + can be changed using the command: ethtool -G eth? tx n, where n is the number of desired tx descriptors. -Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by - default. Ethtool can be used as follows to force speed/duplex. +Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by + default. Ethtool can be used as follows to force speed/duplex. ethtool -s eth? autoneg off speed {10|100} duplex {full|half} NOTE: setting the speed/duplex to incorrect values will cause the link to fail. -Event Log Message Level: The driver uses the message level flag to log events - to syslog. The message level can be set at driver load time. It can also be - set using the command +Event Log Message Level: The driver uses the message level flag to log events + to syslog. The message level can be set at driver load time. It can also be + set using the command: ethtool -s eth? msglvl n + Additional Configurations ========================= Configuring the Driver on Different Distributions ------------------------------------------------- - Configuring a network driver to load properly when the system is started is - distribution dependent. Typically, the configuration process involves adding - an alias line to /etc/modules.conf as well as editing other system startup - scripts and/or configuration files. Many popular Linux distributions ship - with tools to make these changes for you. To learn the proper way to - configure a network device for your system, refer to your distribution - documentation. If during this process you are asked for the driver or module - name, the name for the Linux Base Driver for the Intel PRO/100 Family of - Adapters is e100. + Configuring a network driver to load properly when the system is started is + distribution dependent. Typically, the configuration process involves adding + an alias line to /etc/modules.conf or /etc/modprobe.conf as well as editing + other system startup scripts and/or configuration files. Many popular Linux + distributions ship with tools to make these changes for you. To learn the + proper way to configure a network device for your system, refer to your + distribution documentation. If during this process you are asked for the + driver or module name, the name for the Linux Base Driver for the Intel + PRO/100 Family of Adapters is e100. - As an example, if you install the e100 driver for two PRO/100 adapters - (eth0 and eth1), add the following to modules.conf: + As an example, if you install the e100 driver for two PRO/100 adapters + (eth0 and eth1), add the following to modules.conf or modprobe.conf: alias eth0 e100 alias eth1 e100 Viewing Link Messages --------------------- - In order to see link messages and other Intel driver information on your - console, you must set the dmesg level up to six. This can be done by - entering the following on the command line before loading the e100 driver: + In order to see link messages and other Intel driver information on your + console, you must set the dmesg level up to six. 
This can be done by + entering the following on the command line before loading the e100 driver: dmesg -n 8 - If you wish to see all messages issued by the driver, including debug + If you wish to see all messages issued by the driver, including debug messages, set the dmesg level to eight. NOTE: This setting is not saved across reboots. + Ethtool ------- @@ -114,29 +129,27 @@ Additional Configurations diagnostics, as well as displaying statistical information. Ethtool version 1.6 or later is required for this functionality. - The latest release of ethtool can be found at: - http://sf.net/projects/gkernel. + The latest release of ethtool can be found from + http://sourceforge.net/projects/gkernel. - NOTE: This driver uses mii support from the kernel. As a result, when - there is no link, ethtool will report speed/duplex to be 10/half. + NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support + for a more complete ethtool feature set can be enabled by upgrading + ethtool to ethtool-1.8.1. - NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support - for a more complete ethtool feature set can be enabled by upgrading - ethtool to ethtool-1.8.1. Enabling Wake on LAN* (WoL) --------------------------- - WoL is provided through the Ethtool* utility. Ethtool is included with Red - Hat* 8.0. For other Linux distributions, download and install Ethtool from - the following website: http://sourceforge.net/projects/gkernel. + WoL is provided through the Ethtool* utility. Ethtool is included with Red + Hat* 8.0. For other Linux distributions, download and install Ethtool from + the following website: http://sourceforge.net/projects/gkernel. - For instructions on enabling WoL with Ethtool, refer to the Ethtool man - page. + For instructions on enabling WoL with Ethtool, refer to the Ethtool man page. WoL will be enabled on the system during the next shut down or reboot. For - this driver version, in order to enable WoL, the e100 driver must be + this driver version, in order to enable WoL, the e100 driver must be loaded when shutting down or rebooting the system. + NAPI ---- @@ -144,6 +157,25 @@ Additional Configurations See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. + Multiple Interfaces on Same Ethernet Broadcast Network + ------------------------------------------------------ + + Due to the default ARP behavior on Linux, it is not possible to have + one system on two IP networks in the same Ethernet broadcast domain + (non-partitioned switch) behave as expected. All Ethernet interfaces + will respond to IP traffic for any IP address assigned to the system. + This results in unbalanced receive traffic. + + If you have multiple interfaces in a server, either turn on ARP + filtering by + + (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter + (this only works if your kernel's version is higher than 2.4.5), or + + (2) installing the interfaces in separate broadcast domains (either + in different switches or in a switch partitioned to VLANs). + + Support ======= @@ -151,20 +183,24 @@ For general information, go to the Intel support website at: http://support.intel.com + or the Intel Wired Networking project hosted by Sourceforge at: + + http://sourceforge.net/projects/e1000 + If an issue is identified with the released source code on the supported -kernel with a supported adapter, email the specific information related to -the issue to linux.nics@intel.com. 
+kernel with a supported adapter, email the specific information related to the +issue to e1000-devel@lists.sourceforge.net. License ======= -This software program is released under the terms of a license agreement -between you ('Licensee') and Intel. Do not use or load this software or any -associated materials (collectively, the 'Software') until you have carefully -read the full terms and conditions of the LICENSE located in this software -package. By loading or using the Software, you agree to the terms of this -Agreement. If you do not agree with the terms of this Agreement, do not -install or use the Software. +This software program is released under the terms of a license agreement +between you ('Licensee') and Intel. Do not use or load this software or any +associated materials (collectively, the 'Software') until you have carefully +read the full terms and conditions of the file COPYING located in this software +package. By loading or using the Software, you agree to the terms of this +Agreement. If you do not agree with the terms of this Agreement, do not install +or use the Software. * Other names and brands may be claimed as the property of others. diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt index 2ebd4058d46..71fe15af356 100644 --- a/Documentation/networking/e1000.txt +++ b/Documentation/networking/e1000.txt @@ -1,7 +1,7 @@ Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters =============================================================== -November 17, 2004 +November 15, 2005 Contents @@ -20,254 +20,316 @@ In This Release =============== This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family -of Adapters, version 5.x.x. +of Adapters. This driver includes support for Itanium(R)2-based systems. -For questions related to hardware requirements, refer to the documentation -supplied with your Intel PRO/1000 adapter. All hardware requirements listed +For questions related to hardware requirements, refer to the documentation +supplied with your Intel PRO/1000 adapter. All hardware requirements listed apply to use with Linux. -Native VLANs are now available with supported kernels. +The following features are now available in supported kernels: + - Native VLANs + - Channel Bonding (teaming) + - SNMP + +Channel Bonding documentation can be found in the Linux kernel source: +/Documentation/networking/bonding.txt + +The driver information previously displayed in the /proc filesystem is not +supported in this release. Alternatively, you can use ethtool (version 1.6 +or later), lspci, and ifconfig to obtain the same information. + +Instructions on updating ethtool can be found in the section "Additional +Configurations" later in this document. + Identifying Your Adapter ======================== -For more information on how to identify your adapter, go to the Adapter & +For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: http://support.intel.com/support/network/adapter/pro100/21397.htm -For the latest Intel network drivers for Linux, refer to the following -website. In the search field, enter your adapter name or type, or use the +For the latest Intel network drivers for Linux, refer to the following +website. 
In the search field, enter your adapter name or type, or use the networking link on the left to search for your adapter: http://downloadfinder.intel.com/scripts-df/support_intel.asp -Command Line Parameters -======================= -If the driver is built as a module, the following optional parameters are -used by entering them on the command line with the modprobe or insmod command -using this syntax: +Command Line Parameters ======================= + +If the driver is built as a module, the following optional parameters +are used by entering them on the command line with the modprobe or insmod +command using this syntax: modprobe e1000 [<option>=<VAL1>,<VAL2>,...] - insmod e1000 [<option>=<VAL1>,<VAL2>,...] + insmod e1000 [<option>=<VAL1>,<VAL2>,...] For example, with two PRO/1000 PCI adapters, entering: insmod e1000 TxDescriptors=80,128 -loads the e1000 driver with 80 TX descriptors for the first adapter and 128 TX -descriptors for the second adapter. +loads the e1000 driver with 80 TX descriptors for the first adapter and 128 +TX descriptors for the second adapter. The default value for each parameter is generally the recommended setting, -unless otherwise noted. Also, if the driver is statically built into the -kernel, the driver is loaded with the default values for all the parameters. -Ethtool can be used to change some of the parameters at runtime. +unless otherwise noted. + +NOTES: For more information about the AutoNeg, Duplex, and Speed + parameters, see the "Speed and Duplex Configuration" section in + this document. - NOTES: For more information about the AutoNeg, Duplex, and Speed - parameters, see the "Speed and Duplex Configuration" section in - this document. + For more information about the InterruptThrottleRate, + RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay + parameters, see the application note at: + http://www.intel.com/design/network/applnots/ap450.htm - For more information about the InterruptThrottleRate, RxIntDelay, - TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay parameters, see the - application note at: - http://www.intel.com/design/network/applnots/ap450.htm + A descriptor describes a data buffer and attributes related to + the data buffer. This information is accessed by the hardware. - A descriptor describes a data buffer and attributes related to the - data buffer. This information is accessed by the hardware. -AutoNeg (adapters using copper connections only) -Valid Range: 0x01-0x0F, 0x20-0x2F +AutoNeg +------- +(Supported only on adapters with copper connections) +Valid Range: 0x01-0x0F, 0x20-0x2F Default Value: 0x2F - This parameter is a bit mask that specifies which speed and duplex - settings the board advertises. When this parameter is used, the Speed and - Duplex parameters must not be specified. - NOTE: Refer to the Speed and Duplex section of this readme for more - information on the AutoNeg parameter. - -Duplex (adapters using copper connections only) -Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) + +This parameter is a bit mask that specifies which speed and duplex +settings the board advertises. When this parameter is used, the Speed +and Duplex parameters must not be specified. + +NOTE: Refer to the Speed and Duplex section of this readme for more + information on the AutoNeg parameter. + + +Duplex +------ +(Supported only on adapters with copper connections) +Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) Default Value: 0 - Defines the direction in which data is allowed to flow. Can be either one - or two-directional. 
If both Duplex and the link partner are set to auto- - negotiate, the board auto-detects the correct duplex. If the link partner - is forced (either full or half), Duplex defaults to half-duplex. + +Defines the direction in which data is allowed to flow. Can be either +one or two-directional. If both Duplex and the link partner are set to +auto-negotiate, the board auto-detects the correct duplex. If the link +partner is forced (either full or half), Duplex defaults to half-duplex. + FlowControl -Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) -Default: Read flow control settings from the EEPROM - This parameter controls the automatic generation(Tx) and response(Rx) to - Ethernet PAUSE frames. +---------- +Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) +Default Value: Reads flow control settings from the EEPROM + +This parameter controls the automatic generation(Tx) and response(Rx) +to Ethernet PAUSE frames. + InterruptThrottleRate -Valid Range: 100-100000 (0=off, 1=dynamic) +--------------------- +(not supported on Intel 82542, 82543 or 82544-based adapters) +Valid Range: 100-100000 (0=off, 1=dynamic) Default Value: 8000 - This value represents the maximum number of interrupts per second the - controller generates. InterruptThrottleRate is another setting used in - interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust - InterruptThrottleRate based on the current traffic load. -Un-supported Adapters: InterruptThrottleRate is NOT supported by 82542, 82543 - or 82544-based adapters. - - NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and - RxAbsIntDelay parameters. In other words, minimizing the receive - and/or transmit absolute delays does not force the controller to - generate more interrupts than what the Interrupt Throttle Rate - allows. - CAUTION: If you are using the Intel PRO/1000 CT Network Connection - (controller 82547), setting InterruptThrottleRate to a value - greater than 75,000, may hang (stop transmitting) adapters under - certain network conditions. If this occurs a NETDEV WATCHDOG - message is logged in the system event log. In addition, the - controller is automatically reset, restoring the network - connection. To eliminate the potential for the hang, ensure - that InterruptThrottleRate is set no greater than 75,000 and is - not set to 0. - NOTE: When e1000 is loaded with default settings and multiple adapters are - in use simultaneously, the CPU utilization may increase non-linearly. - In order to limit the CPU utilization without impacting the overall - throughput, we recommend that you load the driver as follows: - - insmod e1000.o InterruptThrottleRate=3000,3000,3000 - - This sets the InterruptThrottleRate to 3000 interrupts/sec for the - first, second, and third instances of the driver. The range of 2000 to - 3000 interrupts per second works on a majority of systems and is a - good starting point, but the optimal value will be platform-specific. - If CPU utilization is not a concern, use RX_POLLING (NAPI) and default - driver settings. + +This value represents the maximum number of interrupts per second the +controller generates. InterruptThrottleRate is another setting used in +interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust +InterruptThrottleRate based on the current traffic load. + +NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and + RxAbsIntDelay parameters. 
In other words, minimizing the receive + and/or transmit absolute delays does not force the controller to + generate more interrupts than what the Interrupt Throttle Rate + allows. + +CAUTION: If you are using the Intel PRO/1000 CT Network Connection + (controller 82547), setting InterruptThrottleRate to a value + greater than 75,000, may hang (stop transmitting) adapters + under certain network conditions. If this occurs a NETDEV + WATCHDOG message is logged in the system event log. In + addition, the controller is automatically reset, restoring + the network connection. To eliminate the potential for the + hang, ensure that InterruptThrottleRate is set no greater + than 75,000 and is not set to 0. + +NOTE: When e1000 is loaded with default settings and multiple adapters + are in use simultaneously, the CPU utilization may increase non- + linearly. In order to limit the CPU utilization without impacting + the overall throughput, we recommend that you load the driver as + follows: + + insmod e1000.o InterruptThrottleRate=3000,3000,3000 + + This sets the InterruptThrottleRate to 3000 interrupts/sec for + the first, second, and third instances of the driver. The range + of 2000 to 3000 interrupts per second works on a majority of + systems and is a good starting point, but the optimal value will + be platform-specific. If CPU utilization is not a concern, use + RX_POLLING (NAPI) and default driver settings. + RxDescriptors -Valid Range: 80-256 for 82542 and 82543-based adapters - 80-4096 for all other supported adapters +------------- +Valid Range: 80-256 for 82542 and 82543-based adapters + 80-4096 for all other supported adapters Default Value: 256 - This value is the number of receive descriptors allocated by the driver. - Increasing this value allows the driver to buffer more incoming packets. - Each descriptor is 16 bytes. A receive buffer is allocated for each - descriptor and can either be 2048 or 4096 bytes long, depending on the MTU - setting. An incoming packet can span one or more receive descriptors. - The maximum MTU size is 16110. +This value specifies the number of receive descriptors allocated by the +driver. Increasing this value allows the driver to buffer more incoming +packets. Each descriptor is 16 bytes. A receive buffer is also +allocated for each descriptor and is 2048. - NOTE: MTU designates the frame size. It only needs to be set for Jumbo - Frames. - NOTE: Depending on the available system resources, the request for a - higher number of receive descriptors may be denied. In this case, - use a lower number. RxIntDelay -Valid Range: 0-65535 (0=off) +---------- +Valid Range: 0-65535 (0=off) Default Value: 0 - This value delays the generation of receive interrupts in units of 1.024 - microseconds. Receive interrupt reduction can improve CPU efficiency if - properly tuned for specific network traffic. Increasing this value adds - extra latency to frame reception and can end up decreasing the throughput - of TCP traffic. If the system is reporting dropped receives, this value - may be set too high, causing the driver to run out of available receive - descriptors. - - CAUTION: When setting RxIntDelay to a value other than 0, adapters may - hang (stop transmitting) under certain network conditions. If - this occurs a NETDEV WATCHDOG message is logged in the system - event log. In addition, the controller is automatically reset, - restoring the network connection. To eliminate the potential for - the hang ensure that RxIntDelay is set to 0. 
- -RxAbsIntDelay (82540, 82545 and later adapters only) -Valid Range: 0-65535 (0=off) + +This value delays the generation of receive interrupts in units of 1.024 +microseconds. Receive interrupt reduction can improve CPU efficiency if +properly tuned for specific network traffic. Increasing this value adds +extra latency to frame reception and can end up decreasing the throughput +of TCP traffic. If the system is reporting dropped receives, this value +may be set too high, causing the driver to run out of available receive +descriptors. + +CAUTION: When setting RxIntDelay to a value other than 0, adapters may + hang (stop transmitting) under certain network conditions. If + this occurs a NETDEV WATCHDOG message is logged in the system + event log. In addition, the controller is automatically reset, + restoring the network connection. To eliminate the potential + for the hang ensure that RxIntDelay is set to 0. + + +RxAbsIntDelay +------------- +(This parameter is supported only on 82540, 82545 and later adapters.) +Valid Range: 0-65535 (0=off) Default Value: 128 - This value, in units of 1.024 microseconds, limits the delay in which a - receive interrupt is generated. Useful only if RxIntDelay is non-zero, - this value ensures that an interrupt is generated after the initial - packet is received within the set amount of time. Proper tuning, - along with RxIntDelay, may improve traffic throughput in specific network - conditions. - -Speed (adapters using copper connections only) + +This value, in units of 1.024 microseconds, limits the delay in which a +receive interrupt is generated. Useful only if RxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is received within the set amount of time. Proper tuning, +along with RxIntDelay, may improve traffic throughput in specific network +conditions. + + +Speed +----- +(This parameter is supported only on adapters with copper connections.) Valid Settings: 0, 10, 100, 1000 -Default Value: 0 (auto-negotiate at all supported speeds) - Speed forces the line speed to the specified value in megabits per second - (Mbps). If this parameter is not specified or is set to 0 and the link - partner is set to auto-negotiate, the board will auto-detect the correct - speed. Duplex should also be set when Speed is set to either 10 or 100. +Default Value: 0 (auto-negotiate at all supported speeds) + +Speed forces the line speed to the specified value in megabits per second +(Mbps). If this parameter is not specified or is set to 0 and the link +partner is set to auto-negotiate, the board will auto-detect the correct +speed. Duplex should also be set when Speed is set to either 10 or 100. + TxDescriptors -Valid Range: 80-256 for 82542 and 82543-based adapters - 80-4096 for all other supported adapters +------------- +Valid Range: 80-256 for 82542 and 82543-based adapters + 80-4096 for all other supported adapters Default Value: 256 - This value is the number of transmit descriptors allocated by the driver. - Increasing this value allows the driver to queue more transmits. Each - descriptor is 16 bytes. - NOTE: Depending on the available system resources, the request for a - higher number of transmit descriptors may be denied. In this case, - use a lower number. +This value is the number of transmit descriptors allocated by the driver. +Increasing this value allows the driver to queue more transmits. Each +descriptor is 16 bytes. 
+ +NOTE: Depending on the available system resources, the request for a + higher number of transmit descriptors may be denied. In this case, + use a lower number. + TxIntDelay -Valid Range: 0-65535 (0=off) +---------- +Valid Range: 0-65535 (0=off) Default Value: 64 - This value delays the generation of transmit interrupts in units of - 1.024 microseconds. Transmit interrupt reduction can improve CPU - efficiency if properly tuned for specific network traffic. If the - system is reporting dropped transmits, this value may be set too high - causing the driver to run out of available transmit descriptors. - -TxAbsIntDelay (82540, 82545 and later adapters only) -Valid Range: 0-65535 (0=off) + +This value delays the generation of transmit interrupts in units of +1.024 microseconds. Transmit interrupt reduction can improve CPU +efficiency if properly tuned for specific network traffic. If the +system is reporting dropped transmits, this value may be set too high +causing the driver to run out of available transmit descriptors. + + +TxAbsIntDelay +------------- +(This parameter is supported only on 82540, 82545 and later adapters.) +Valid Range: 0-65535 (0=off) Default Value: 64 - This value, in units of 1.024 microseconds, limits the delay in which a - transmit interrupt is generated. Useful only if TxIntDelay is non-zero, - this value ensures that an interrupt is generated after the initial - packet is sent on the wire within the set amount of time. Proper tuning, - along with TxIntDelay, may improve traffic throughput in specific - network conditions. - -XsumRX (not available on the 82542-based adapter) -Valid Range: 0-1 + +This value, in units of 1.024 microseconds, limits the delay in which a +transmit interrupt is generated. Useful only if TxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is sent on the wire within the set amount of time. Proper tuning, +along with TxIntDelay, may improve traffic throughput in specific +network conditions. + +XsumRX +------ +(This parameter is NOT supported on the 82542-based adapter.) +Valid Range: 0-1 Default Value: 1 - A value of '1' indicates that the driver should enable IP checksum - offload for received packets (both UDP and TCP) to the adapter hardware. + +A value of '1' indicates that the driver should enable IP checksum +offload for received packets (both UDP and TCP) to the adapter hardware. + Speed and Duplex Configuration ============================== -Three keywords are used to control the speed and duplex configuration. These -keywords are Speed, Duplex, and AutoNeg. +Three keywords are used to control the speed and duplex configuration. +These keywords are Speed, Duplex, and AutoNeg. -If the board uses a fiber interface, these keywords are ignored, and the +If the board uses a fiber interface, these keywords are ignored, and the fiber interface board only links at 1000 Mbps full-duplex. For copper-based boards, the keywords interact as follows: - The default operation is auto-negotiate. The board advertises all supported - speed and duplex combinations, and it links at the highest common speed and - duplex mode IF the link partner is set to auto-negotiate. + The default operation is auto-negotiate. The board advertises all + supported speed and duplex combinations, and it links at the highest + common speed and duplex mode IF the link partner is set to auto-negotiate. 
- If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps is - advertised (The 1000BaseT spec requires auto-negotiation.) + If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps + is advertised (The 1000BaseT spec requires auto-negotiation.) If Speed = 10 or 100, then both Speed and Duplex should be set. Auto- - negotiation is disabled, and the AutoNeg parameter is ignored. Partner SHOULD - also be forced. + negotiation is disabled, and the AutoNeg parameter is ignored. Partner + SHOULD also be forced. + +The AutoNeg parameter is used when more control is required over the +auto-negotiation process. It should be used when you wish to control which +speed and duplex combinations are advertised during the auto-negotiation +process. + +The parameter may be specified as either a decimal or hexidecimal value as +determined by the bitmap below. -The AutoNeg parameter is used when more control is required over the auto- -negotiation process. When this parameter is used, Speed and Duplex parameters -must not be specified. The following table describes supported values for the -AutoNeg parameter: +Bit position 7 6 5 4 3 2 1 0 +Decimal Value 128 64 32 16 8 4 2 1 +Hex value 80 40 20 10 8 4 2 1 +Speed (Mbps) N/A N/A 1000 N/A 100 100 10 10 +Duplex Full Full Half Full Half -Speed (Mbps) 1000 100 100 10 10 -Duplex Full Full Half Full Half -Value (in base 16) 0x20 0x08 0x04 0x02 0x01 +Some examples of using AutoNeg: -Example: insmod e1000 AutoNeg=0x03, loads e1000 and specifies (10 full duplex, -10 half duplex) for negotiation with the peer. + modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half) + modprobe e1000 AutoNeg=1 (Same as above) + modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full) + modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full) + modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half) + modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100 + Half) + modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full) + modprobe e1000 AutoNeg=32 (Same as above) -Note that setting AutoNeg does not guarantee that the board will link at the -highest specified speed or duplex mode, but the board will link at the -highest possible speed/duplex of the link partner IF the link partner is also -set to auto-negotiate. If the link partner is forced speed/duplex, the -adapter MUST be forced to the same speed/duplex. +Note that when this parameter is used, Speed and Duplex must not be specified. + +If the link partner is forced to a specific speed and duplex, then this +parameter should not be used. Instead, use the Speed and Duplex parameters +previously mentioned to force the adapter to the same speed and duplex. Additional Configurations @@ -276,19 +338,19 @@ Additional Configurations Configuring the Driver on Different Distributions ------------------------------------------------- - Configuring a network driver to load properly when the system is started is - distribution dependent. Typically, the configuration process involves adding - an alias line to /etc/modules.conf as well as editing other system startup - scripts and/or configuration files. Many popular Linux distributions ship - with tools to make these changes for you. To learn the proper way to - configure a network device for your system, refer to your distribution - documentation. 
If during this process you are asked for the driver or module - name, the name for the Linux Base Driver for the Intel PRO/1000 Family of - Adapters is e1000. + Configuring a network driver to load properly when the system is started + is distribution dependent. Typically, the configuration process involves + adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well + as editing other system startup scripts and/or configuration files. Many + popular Linux distributions ship with tools to make these changes for you. + To learn the proper way to configure a network device for your system, + refer to your distribution documentation. If during this process you are + asked for the driver or module name, the name for the Linux Base Driver + for the Intel PRO/1000 Family of Adapters is e1000. - As an example, if you install the e1000 driver for two PRO/1000 adapters - (eth0 and eth1) and set the speed and duplex to 10full and 100half, add the - following to modules.conf: + As an example, if you install the e1000 driver for two PRO/1000 adapters + (eth0 and eth1) and set the speed and duplex to 10full and 100half, add + the following to modules.conf or or modprobe.conf: alias eth0 e1000 alias eth1 e1000 @@ -297,9 +359,9 @@ Additional Configurations Viewing Link Messages --------------------- - Link messages will not be displayed to the console if the distribution is - restricting system messages. In order to see network driver link messages on - your console, set dmesg to eight by entering the following: + Link messages will not be displayed to the console if the distribution is + restricting system messages. In order to see network driver link messages + on your console, set dmesg to eight by entering the following: dmesg -n 8 @@ -308,22 +370,42 @@ Additional Configurations Jumbo Frames ------------ - The driver supports Jumbo Frames for all adapters except 82542-based - adapters. Jumbo Frames support is enabled by changing the MTU to a value - larger than the default of 1500. Use the ifconfig command to increase the - MTU size. For example: + The driver supports Jumbo Frames for all adapters except 82542 and + 82573-based adapters. Jumbo Frames support is enabled by changing the + MTU to a value larger than the default of 1500. Use the ifconfig command + to increase the MTU size. For example: + + ifconfig eth<x> mtu 9000 up + + This setting is not saved across reboots. It can be made permanent if + you add: + + MTU=9000 - ifconfig ethx mtu 9000 up + to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example + applies to the Red Hat distributions; other distributions may store this + setting in a different location. - The maximum MTU setting for Jumbo Frames is 16110. This value coincides - with the maximum Jumbo Frames size of 16128. + Notes: - NOTE: Jumbo Frames are supported at 1000 Mbps only. Using Jumbo Frames at - 10 or 100 Mbps may result in poor performance or loss of link. + - To enable Jumbo Frames, increase the MTU size on the interface beyond + 1500. + - The maximum MTU setting for Jumbo Frames is 16110. This value coincides + with the maximum Jumbo Frames size of 16128. + - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or + loss of link. + - Some Intel gigabit adapters that support Jumbo Frames have a frame size + limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes. 
+ The adapters with this limitation are based on the Intel 82571EB and + 82572EI controllers, which correspond to these product names: + Intel� PRO/1000 PT Dual Port Server Adapter + Intel� PRO/1000 PF Dual Port Server Adapter + Intel� PRO/1000 PT Server Adapter + Intel� PRO/1000 PT Desktop Adapter + Intel� PRO/1000 PF Server Adapter + - The Intel PRO/1000 PM Network Connection does not support jumbo frames. - NOTE: MTU designates the frame size. To enable Jumbo Frames, increase the - MTU size on the interface beyond 1500. Ethtool ------- @@ -333,32 +415,41 @@ Additional Configurations version 1.6 or later is required for this functionality. The latest release of ethtool can be found from - http://sf.net/projects/gkernel. + http://sourceforge.net/projects/gkernel. - NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support - for a more complete ethtool feature set can be enabled by upgrading - ethtool to ethtool-1.8.1. + NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support + for a more complete ethtool feature set can be enabled by upgrading + ethtool to ethtool-1.8.1. Enabling Wake on LAN* (WoL) --------------------------- WoL is configured through the Ethtool* utility. Ethtool is included with - all versions of Red Hat after Red Hat 7.2. For other Linux distributions, - download and install Ethtool from the following website: + all versions of Red Hat after Red Hat 7.2. For other Linux distributions, + download and install Ethtool from the following website: http://sourceforge.net/projects/gkernel. - For instructions on enabling WoL with Ethtool, refer to the website listed + For instructions on enabling WoL with Ethtool, refer to the website listed above. - WoL will be enabled on the system during the next shut down or reboot. - For this driver version, in order to enable WoL, the e1000 driver must be + WoL will be enabled on the system during the next shut down or reboot. + For this driver version, in order to enable WoL, the e1000 driver must be loaded when shutting down or rebooting the system. NAPI ---- NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled - or disabled based on the configuration of the kernel. + or disabled based on the configuration of the kernel. To override + the default, use the following compile-time flags. + + To enable NAPI, compile the driver module, passing in a configuration option: + + make CFLAGS_EXTRA=-DE1000_NAPI install + + To disable NAPI, compile the driver module, passing in a configuration option: + + make CFLAGS_EXTRA=-DE1000_NO_NAPI install See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. @@ -369,10 +460,85 @@ Known Issues Jumbo Frames System Requirement ------------------------------- - Memory allocation failures have been observed on Linux systems with 64 MB - of RAM or less that are running Jumbo Frames. If you are using Jumbo Frames, - your system may require more than the advertised minimum requirement of 64 MB - of system memory. + Memory allocation failures have been observed on Linux systems with 64 MB + of RAM or less that are running Jumbo Frames. If you are using Jumbo + Frames, your system may require more than the advertised minimum + requirement of 64 MB of system memory. + + Performance Degradation with Jumbo Frames + ----------------------------------------- + + Degradation in throughput performance may be observed in some Jumbo frames + environments. 
If this is observed, increasing the application's socket + buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values + may help. See the specific application manual and + /usr/src/linux*/Documentation/ + networking/ip-sysctl.txt for more details. + + Jumbo frames on Foundry BigIron 8000 switch + ------------------------------------------- + There is a known issue using Jumbo frames when connected to a Foundry + BigIron 8000 switch. This is a 3rd party limitation. If you experience + loss of packets, lower the MTU size. + + Multiple Interfaces on Same Ethernet Broadcast Network + ------------------------------------------------------ + + Due to the default ARP behavior on Linux, it is not possible to have + one system on two IP networks in the same Ethernet broadcast domain + (non-partitioned switch) behave as expected. All Ethernet interfaces + will respond to IP traffic for any IP address assigned to the system. + This results in unbalanced receive traffic. + + If you have multiple interfaces in a server, either turn on ARP + filtering by entering: + + echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter + (this only works if your kernel's version is higher than 2.4.5), + + NOTE: This setting is not saved across reboots. The configuration + change can be made permanent by adding the line: + net.ipv4.conf.all.arp_filter = 1 + to the file /etc/sysctl.conf + + or, + + install the interfaces in separate broadcast domains (either in + different switches or in a switch partitioned to VLANs). + + 82541/82547 can't link or are slow to link with some link partners + ----------------------------------------------------------------- + + There is a known compatibility issue with 82541/82547 and some + low-end switches where the link will not be established, or will + be slow to establish. In particular, these switches are known to + be incompatible with 82541/82547: + + Planex FXG-08TE + I-O Data ETG-SH8 + + To workaround this issue, the driver can be compiled with an override + of the PHY's master/slave setting. Forcing master or forcing slave + mode will improve time-to-link. + + # make EXTRA_CFLAGS=-DE1000_MASTER_SLAVE=<n> + + Where <n> is: + + 0 = Hardware default + 1 = Master mode + 2 = Slave mode + 3 = Auto master/slave + + Disable rx flow control with ethtool + ------------------------------------ + + In order to disable receive flow control using ethtool, you must turn + off auto-negotiation on the same command line. + + For example: + + ethtool -A eth? autoneg off rx off Support @@ -382,20 +548,24 @@ For general information, go to the Intel support website at: http://support.intel.com + or the Intel Wired Networking project hosted by Sourceforge at: + + http://sourceforge.net/projects/e1000 + If an issue is identified with the released source code on the supported -kernel with a supported adapter, email the specific information related to -the issue to linux.nics@intel.com. +kernel with a supported adapter, email the specific information related +to the issue to e1000-devel@lists.sourceforge.net License ======= -This software program is released under the terms of a license agreement -between you ('Licensee') and Intel. Do not use or load this software or any -associated materials (collectively, the 'Software') until you have carefully -read the full terms and conditions of the LICENSE located in this software -package. By loading or using the Software, you agree to the terms of this -Agreement. 
If you do not agree with the terms of this Agreement, do not +This software program is released under the terms of a license agreement +between you ('Licensee') and Intel. Do not use or load this software or any +associated materials (collectively, the 'Software') until you have carefully +read the full terms and conditions of the file COPYING located in this software +package. By loading or using the Software, you agree to the terms of this +Agreement. If you do not agree with the terms of this Agreement, do not install or use the Software. * Other names and brands may be claimed as the property of others. diff --git a/MAINTAINERS b/MAINTAINERS index b0dc75a5e74..dd1351dc32b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1349,10 +1349,10 @@ S: Maintained INTEL PRO/100 ETHERNET SUPPORT P: John Ronciak M: john.ronciak@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com P: Jesse Brandeburg M: jesse.brandeburg@intel.com +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported @@ -1361,18 +1361,22 @@ P: Jeb Cramer M: cramerj@intel.com P: John Ronciak M: john.ronciak@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com +P: Jesse Brandeburg +M: jesse.brandeburg@intel.com +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported INTEL PRO/10GbE SUPPORT +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com P: Ayyappan Veeraiyan M: ayyappan.veeraiyan@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com P: John Ronciak M: john.ronciak@intel.com +P: Jesse Brandeburg +M: jesse.brandeburg@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 486d7945583..544ac5dc09e 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end) void *__start = start; for (; __start < end; __start += PAGE_SIZE) { ClearPageReserved(virt_to_page(__start)); - set_page_count(virt_to_page(__start), 1); + init_page_count(virt_to_page(__start)); free_page((long)__start); totalram_pages++; } diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index c2ee18d2075..8a1bfcd5008 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pte = consistent_pte[idx] + off; c->vm_pages = page; + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); /* * x86 does not mark the pages reserved... */ @@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, * Free the otherwise unused pages. 
*/ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 8b276ee38ac..b0321e943b7 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index 1f09a9d0fb8..e3ecaa45374 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 31a0018b525..b7842ff213a 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -216,7 +216,7 @@ free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index 0f1c6cbc4f5..aa6b7d0a210 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -27,6 +27,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 342823aad75..636b2f8b5d9 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle) */ if (order > 0) { struct page *rpage = virt_to_page(page); - - for (i = 1; i < (1 << order); i++) - set_page_count(rpage + i, 1); + split_page(rpage, order); } err = 0; diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 765088ea8a5..8899aa1a4f0 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -169,7 +169,7 @@ void __init mem_init(void) struct page *page = &mem_map[pfn]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; } @@ -210,7 +210,7 @@ void __init free_initmem(void) /* next to check that the page we free is not a partial page */ for (addr = start; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c index 5cc76efaf7a..69d6ad32d56 100644 --- a/arch/h8300/kernel/h8300_ksyms.c +++ b/arch/h8300/kernel/h8300_ksyms.c @@ -25,6 +25,7 @@ extern char h8300_debug_device[]; /* platform dependent support */ EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); 
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1e0929ddc8c..09efc4b1f03 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -219,7 +219,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index c9cad7ba0d2..aeabb419686 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void) unsigned long cr4; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - cpu_gdt_descr->address = __va(cpu_gdt_descr->address); + cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); load_gdt(cpu_gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 218d725a5a1..d134e9643a5 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void) spin_unlock_irq(&call_lock); } -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ - -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until +static struct call_data_struct *call_data; + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until * remote CPUs are nearly ready to execute <<func>> or are or have executed. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) { struct call_data_struct data; int cpus; diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index a4a61976ecb..8fdb1fb17a5 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -40,14 +40,13 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) return error; } -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { int error = -EBADF; - struct file * file = NULL; + struct file *file = NULL; + struct mm_struct *mm = current->mm; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { @@ -56,9 +55,9 @@ static inline long do_mmap2( goto out; } - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); if (file) fput(file); @@ -66,13 +65,6 @@ out: return error; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -101,7 +93,8 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + a.fd, a.offset >> PAGE_SHIFT); out: return err; } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index be242723c33..17a6fe7166e 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -46,7 +46,7 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index a7f5a2aceba..5e41ee29c8c 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -74,7 +74,7 @@ late_initcall(start_lost_tick_compensation); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9af..a7d89158541 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. 
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994b..7ba55a6e2db 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) static void __meminit free_new_highpage(struct page *page) { - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -727,7 +727,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index d0cadb33b54..92c3d9f0e73 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -51,6 +51,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page that have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -143,11 +150,12 @@ __change_page_attr(struct page *page, pgprot_t prot) return -ENOMEM; set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); kpte_page = split; - } - get_page(kpte_page); + } + page_private(kpte_page)++; } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); @@ -157,10 +165,8 @@ __change_page_attr(struct page *page, pgprot_t prot) * replace it with a largepage. */ if (!PageReserved(kpte_page)) { - /* memleak and potential failed 2M page regeneration */ - BUG_ON(!page_count(kpte_page)); - - if (cpu_has_pse && (page_count(kpte_page) == 1)) { + if (cpu_has_pse && (page_private(kpte_page) == 0)) { + ClearPagePrivate(kpte_page); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index a85ea9d37f0..ff7ae6b664e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -271,6 +271,25 @@ config SCHED_SMT Intel IA64 chips with MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config PERMIT_BSP_REMOVE + bool "Support removal of Bootstrap Processor" + depends on HOTPLUG_CPU + default n + ---help--- + Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU + support. 
+ +config FORCE_CPEI_RETARGET + bool "Force assumption that CPEI can be re-targetted" + depends on PERMIT_BSP_REMOVE + default n + ---help--- + Say Y if you need to force the assumption that CPEI can be re-targetted to + any cpu in the system. This hint is available via ACPI 3.0 specifications. + Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP. + This option it useful to enable this feature on older BIOS's as well. + You can also enable this by using boot command line option force_cpei=1. + config PREEMPT bool "Preemptible Kernel" help diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index 125568118b8..766bf495543 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -116,6 +116,8 @@ CONFIG_FORCE_MAX_ZONEORDER=17 CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_HOTPLUG_CPU=y +CONFIG_PERMIT_BSP_REMOVE=y +CONFIG_FORCE_CPEI_RETARGET=y # CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set CONFIG_SELECT_MEMORY_MODEL=y diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index ecd44bdc839..4722ec51c70 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -284,19 +284,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header, return 0; } +#ifdef CONFIG_HOTPLUG_CPU unsigned int can_cpei_retarget(void) { extern int cpe_vector; + extern unsigned int force_cpei_retarget; /* * Only if CPEI is supported and the override flag * is present, otherwise return that its re-targettable * if we are in polling mode. */ - if (cpe_vector > 0 && !acpi_cpei_override) - return 0; - else - return 1; + if (cpe_vector > 0) { + if (acpi_cpei_override || force_cpei_retarget) + return 1; + else + return 0; + } + return 1; } unsigned int is_cpu_cpei_target(unsigned int cpu) @@ -315,6 +320,7 @@ void set_cpei_target_cpu(unsigned int cpu) { acpi_cpei_phys_cpuid = cpu_physical_id(cpu); } +#endif unsigned int get_cpei_target_cpu(void) { diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 930fdfca6dd..0e3eda99e54 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1102,9 +1102,6 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? -(p6) br.cond.sptk.few .sigdelayed - ;; tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? (p6) br.cond.sptk.few .notify #ifdef CONFIG_PREEMPT @@ -1131,17 +1128,6 @@ skip_rbs_switch: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // don't re-check -// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where -// it could not be delivered. Deliver it now. The signal might be for us and -// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed -// signal. 
- -.sigdelayed: - br.call.sptk.many rp=do_sigdelayed - cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // re-check - .work_pending_syscall_end: adds r2=PT(R8)+16,r12 adds r3=PT(R10)+16,r12 diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 574084f343f..8832c553230 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -631,6 +631,7 @@ get_target_cpu (unsigned int gsi, int vector) { #ifdef CONFIG_SMP static int cpu = -1; + extern int cpe_vector; /* * In case of vector shared by multiple RTEs, all RTEs that @@ -653,6 +654,11 @@ get_target_cpu (unsigned int gsi, int vector) if (!cpu_online(smp_processor_id())) return cpu_physical_id(smp_processor_id()); +#ifdef CONFIG_ACPI + if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR) + return get_cpei_target_cpu(); +#endif + #ifdef CONFIG_NUMA { int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index d33244c3275..5ce908ef9c9 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -163,8 +163,19 @@ void fixup_irqs(void) { unsigned int irq; extern void ia64_process_pending_intr(void); + extern void ia64_disable_timer(void); + extern volatile int time_keeper_id; + + ia64_disable_timer(); + + /* + * Find a new timesync master + */ + if (smp_processor_id() == time_keeper_id) { + time_keeper_id = first_cpu(cpu_online_map); + printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id); + } - ia64_set_itv(1<<16); /* * Phase 1: Locate irq's bound to this cpu and * relocate them for cpu removal. diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index ee7eec9ee57..b57e723f194 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -281,14 +281,10 @@ ia64_mca_log_sal_error_record(int sal_info_type) ia64_sal_clear_state_info(sal_info_type); } -/* - * platform dependent error handling - */ -#ifndef PLATFORM_MCA_HANDLERS - #ifdef CONFIG_ACPI int cpe_vector = -1; +int ia64_cpe_irq = -1; static irqreturn_t ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) @@ -377,8 +373,6 @@ ia64_mca_register_cpev (int cpev) } #endif /* CONFIG_ACPI */ -#endif /* PLATFORM_MCA_HANDLERS */ - /* * ia64_mca_cmc_vector_setup * @@ -630,6 +624,32 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) *tnat |= (nat << tslot); } +/* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + +static void +ia64_mca_modify_comm(const task_t *previous_current) +{ + char *p, comm[sizeof(current->comm)]; + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + task_thread_info(previous_current)->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); +} + /* On entry to this routine, we are running on the per cpu stack, see * mca_asm.h. The original stack has not been touched by this event. Some of * the original stack's registers will be in the RBS on this stack. 
This stack @@ -648,7 +668,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, struct ia64_sal_os_state *sos, const char *type) { - char *p, comm[sizeof(current->comm)]; + char *p; ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ const pal_min_state_area_t *ms = sos->pal_min_state; @@ -721,6 +741,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, /* Verify the previous stack state before we change it */ if (user_mode(regs)) { msg = "occurred in user space"; + /* previous_current is guaranteed to be valid when the task was + * in user space, so ... + */ + ia64_mca_modify_comm(previous_current); goto no_mod; } if (r13 != sos->prev_IA64_KR_CURRENT) { @@ -750,25 +774,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, goto no_mod; } - /* Change the comm field on the MCA/INT task to include the pid that - * was interrupted, it makes for easier debugging. If that pid was 0 - * (swapper or nested MCA/INIT) then use the start of the previous comm - * field suffixed with its cpu. - */ - if (previous_current->pid) - snprintf(comm, sizeof(comm), "%s %d", - current->comm, previous_current->pid); - else { - int l; - if ((p = strchr(previous_current->comm, ' '))) - l = p - previous_current->comm; - else - l = strlen(previous_current->comm); - snprintf(comm, sizeof(comm), "%s %*s %d", - current->comm, l, previous_current->comm, - task_thread_info(previous_current)->cpu); - } - memcpy(current->comm, comm, sizeof(current->comm)); + ia64_mca_modify_comm(previous_current); /* Make the original task look blocked. First stack a struct pt_regs, * describing the state at the time of interrupt. mca_asm.S built a @@ -908,7 +914,7 @@ no_mod: static void ia64_wait_for_slaves(int monarch) { - int c, wait = 0; + int c, wait = 0, missing = 0; for_each_online_cpu(c) { if (c == monarch) continue; @@ -919,15 +925,32 @@ ia64_wait_for_slaves(int monarch) } } if (!wait) - return; + goto all_in; for_each_online_cpu(c) { if (c == monarch) continue; if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + missing = 1; break; } } + if (!missing) + goto all_in; + printk(KERN_INFO "OS MCA slave did not rendezvous on cpu"); + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + printk(" %d", c); + } + printk("\n"); + return; + +all_in: + printk(KERN_INFO "All OS MCA slaves have reached rendezvous\n"); + return; } /* @@ -953,6 +976,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, task_t *previous_current; oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ + console_loglevel = 15; /* make sure printks make it to console */ + printk(KERN_INFO "Entered OS MCA handler. 
PSP=%lx cpu=%d monarch=%ld\n", + sos->proc_state_param, cpu, sos->monarch); + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); monarch_cpu = cpu; if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) @@ -1444,11 +1471,13 @@ void __devinit ia64_mca_cpu_init(void *cpu_data) { void *pal_vaddr; + static int first_time = 1; - if (smp_processor_id() == 0) { + if (first_time) { void *mca_data; int cpu; + first_time = 0; mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) * NR_CPUS + KERNEL_STACK_SIZE); mca_data = (void *)(((unsigned long)mca_data + @@ -1704,6 +1733,7 @@ ia64_mca_late_init(void) desc = irq_descp(irq); desc->status |= IRQ_PER_CPU; setup_irq(irq, &mca_cpe_irqaction); + ia64_cpe_irq = irq; } ia64_mca_register_cpev(cpe_vector); IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9c5194b385d..077f21216b6 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6722,6 +6722,7 @@ __initcall(pfm_init); void pfm_init_percpu (void) { + static int first_time=1; /* * make sure no measurement is active * (may inherit programmed PMCs from EFI). @@ -6734,8 +6735,10 @@ pfm_init_percpu (void) */ pfm_unfreeze_pmu(); - if (smp_processor_id() == 0) + if (first_time) { register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + first_time=0; + } ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ia64_srlz_d(); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 463f6bb44d0..1d7903ee212 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -588,104 +588,3 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) } return 0; } - -/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it - * could not be delivered. It is important that the target process is not - * allowed to do any more work in user space. Possible cases for the target - * process: - * - * - It is sleeping and will wake up soon. Store the data in the current task, - * the signal will be sent when the current task returns from the next - * interrupt. - * - * - It is running in user context. Store the data in the current task, the - * signal will be sent when the current task returns from the next interrupt. - * - * - It is running in kernel context on this or another cpu and will return to - * user context. Store the data in the target task, the signal will be sent - * to itself when the target task returns to user space. - * - * - It is running in kernel context on this cpu and will sleep before - * returning to user context. Because this is also the current task, the - * signal will not get delivered and the task could sleep indefinitely. - * Store the data in the idle task for this cpu, the signal will be sent - * after the idle task processes its next interrupt. - * - * To cover all cases, store the data in the target task, the current task and - * the idle task on this cpu. Whatever happens, the signal will be delivered - * to the target task before it can do any useful user space work. Multiple - * deliveries have no unwanted side effects. - * - * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts - * disabled. It must not take any locks nor use kernel structures or services - * that require locks. - */ - -/* To ensure that we get the right pid, check its start time. 
To avoid extra - * include files in thread_info.h, convert the task start_time to unsigned long, - * giving us a cycle time of > 580 years. - */ -static inline unsigned long -start_time_ul(const struct task_struct *t) -{ - return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; -} - -void -set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) -{ - struct task_struct *t; - unsigned long start_time = 0; - int i; - - for (i = 1; i <= 3; ++i) { - switch (i) { - case 1: - t = find_task_by_pid(pid); - if (t) - start_time = start_time_ul(t); - break; - case 2: - t = current; - break; - default: - t = idle_task(smp_processor_id()); - break; - } - - if (!t) - return; - task_thread_info(t)->sigdelayed.signo = signo; - task_thread_info(t)->sigdelayed.code = code; - task_thread_info(t)->sigdelayed.addr = addr; - task_thread_info(t)->sigdelayed.start_time = start_time; - task_thread_info(t)->sigdelayed.pid = pid; - wmb(); - set_tsk_thread_flag(t, TIF_SIGDELAYED); - } -} - -/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that - * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. - */ - -void -do_sigdelayed(void) -{ - struct siginfo siginfo; - pid_t pid; - struct task_struct *t; - - clear_thread_flag(TIF_SIGDELAYED); - memset(&siginfo, 0, sizeof(siginfo)); - siginfo.si_signo = current_thread_info()->sigdelayed.signo; - siginfo.si_code = current_thread_info()->sigdelayed.code; - siginfo.si_addr = current_thread_info()->sigdelayed.addr; - pid = current_thread_info()->sigdelayed.pid; - t = find_task_by_pid(pid); - if (!t) - return; - if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) - return; - force_sig_info(siginfo.si_signo, &siginfo, t); -} diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index b681ef34a86..c4b633b36da 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -70,6 +70,12 @@ #endif #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_PERMIT_BSP_REMOVE +#define bsp_remove_ok 1 +#else +#define bsp_remove_ok 0 +#endif + /* * Store all idle threads, this can be reused instead of creating * a new thread. Also avoids complicated thread destroy functionality @@ -104,7 +110,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0]; /* * ITC synchronization related stuff: */ -#define MASTER 0 +#define MASTER (0) #define SLAVE (SMP_CACHE_BYTES/8) #define NUM_ROUNDS 64 /* magic value */ @@ -151,6 +157,27 @@ char __initdata no_int_routing; unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? 
*/ +#ifdef CONFIG_FORCE_CPEI_RETARGET +#define CPEI_OVERRIDE_DEFAULT (1) +#else +#define CPEI_OVERRIDE_DEFAULT (0) +#endif + +unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT; + +static int __init +cmdl_force_cpei(char *str) +{ + int value=0; + + get_option (&str, &value); + force_cpei_retarget = value; + + return 1; +} + +__setup("force_cpei=", cmdl_force_cpei); + static int __init nointroute (char *str) { @@ -161,6 +188,27 @@ nointroute (char *str) __setup("nointroute", nointroute); +static void fix_b0_for_bsp(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + int cpuid; + static int fix_bsp_b0 = 1; + + cpuid = smp_processor_id(); + + /* + * Cache the b0 value on the first AP that comes up + */ + if (!(fix_bsp_b0 && cpuid)) + return; + + sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0]; + printk ("Fixed BSP b0 value from CPU %d\n", cpuid); + + fix_bsp_b0 = 0; +#endif +} + void sync_master (void *arg) { @@ -327,8 +375,9 @@ smp_setup_percpu_timer (void) static void __devinit smp_callin (void) { - int cpuid, phys_id; + int cpuid, phys_id, itc_master; extern void ia64_init_itm(void); + extern volatile int time_keeper_id; #ifdef CONFIG_PERFMON extern void pfm_init_percpu(void); @@ -336,6 +385,7 @@ smp_callin (void) cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); + itc_master = time_keeper_id; if (cpu_online(cpuid)) { printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", @@ -343,6 +393,8 @@ smp_callin (void) BUG(); } + fix_b0_for_bsp(); + lock_ipi_calllock(); cpu_set(cpuid, cpu_online_map); unlock_ipi_calllock(); @@ -365,8 +417,8 @@ smp_callin (void) * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls * local_bh_enable(), which bugs out if irqs are not enabled... */ - Dprintk("Going to syncup ITC with BP.\n"); - ia64_sync_itc(0); + Dprintk("Going to syncup ITC with ITC Master.\n"); + ia64_sync_itc(itc_master); } /* @@ -635,6 +687,47 @@ remove_siblinginfo(int cpu) } extern void fixup_irqs(void); + +int migrate_platform_irqs(unsigned int cpu) +{ + int new_cpei_cpu; + irq_desc_t *desc = NULL; + cpumask_t mask; + int retval = 0; + + /* + * dont permit CPEI target to removed. + */ + if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) { + printk ("CPU (%d) is CPEI Target\n", cpu); + if (can_cpei_retarget()) { + /* + * Now re-target the CPEI to a different processor + */ + new_cpei_cpu = any_online_cpu(cpu_online_map); + mask = cpumask_of_cpu(new_cpei_cpu); + set_cpei_target_cpu(new_cpei_cpu); + desc = irq_descp(ia64_cpe_irq); + /* + * Switch for now, immediatly, we need to do fake intr + * as other interrupts, but need to study CPEI behaviour with + * polling before making changes. 
+ */ + if (desc) { + desc->handler->disable(ia64_cpe_irq); + desc->handler->set_affinity(ia64_cpe_irq, mask); + desc->handler->enable(ia64_cpe_irq); + printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu); + } + } + if (!desc) { + printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu); + retval = -EBUSY; + } + } + return retval; +} + /* must be called with cpucontrol mutex held */ int __cpu_disable(void) { @@ -643,8 +736,17 @@ int __cpu_disable(void) /* * dont permit boot processor for now */ - if (cpu == 0) - return -EBUSY; + if (cpu == 0 && !bsp_remove_ok) { + printk ("Your platform does not support removal of BSP\n"); + return (-EBUSY); + } + + cpu_clear(cpu, cpu_online_map); + + if (migrate_platform_irqs(cpu)) { + cpu_set(cpu, cpu_online_map); + return (-EBUSY); + } remove_siblinginfo(cpu); cpu_clear(cpu, cpu_online_map); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 307d01e15b2..ac167436e93 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -32,7 +32,7 @@ extern unsigned long wall_jiffies; -#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ +volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -71,7 +71,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == TIME_KEEPER_ID) { + if (smp_processor_id() == time_keeper_id) { /* * Here we are in the timer irq handler. We have irqs locally * disabled, but we don't know if the timer_bh is running on @@ -236,6 +236,11 @@ static struct irqaction timer_irqaction = { .name = "timer" }; +void __devinit ia64_disable_timer(void) +{ + ia64_set_itv(1 << 16); +} + void __init time_init (void) { diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 6e5eea19fa6..3b6fd798c4d 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -36,7 +36,7 @@ int arch_register_cpu(int num) parent = &sysfs_nodes[cpu_to_node(num)]; #endif /* CONFIG_NUMA */ -#ifdef CONFIG_ACPI +#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) /* * If CPEI cannot be re-targetted, and this is * CPEI target, then dont create the control file diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index acaaec4e468..9855ba31809 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -181,13 +181,15 @@ per_cpu_init (void) { void *cpu_data; int cpu; + static int first_time=1; /* * get_free_pages() cannot be used before cpu_init() done. BSP * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls * get_zeroed_page(). 
*/ - if (smp_processor_id() == 0) { + if (first_time) { + first_time=0; cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c87d6d1d581..573d5cc63e2 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -528,12 +528,17 @@ void __init find_memory(void) void *per_cpu_init(void) { int cpu; + static int first_time = 1; + if (smp_processor_id() != 0) return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; - for (cpu = 0; cpu < NR_CPUS; cpu++) - per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + if (first_time) { + first_time = 0; + for (cpu = 0; cpu < NR_CPUS; cpu++) + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; } diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 2d13889d0a9..9dbc7dadd16 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* - * This function checks for proper alignment of input addr and len parameters. + * Don't actually need to do any preparation, but need to make sure + * the address is in the right region. */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b38b6d213c1..08d94e6bfa1 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -197,7 +197,7 @@ free_initmem (void) eaddr = (unsigned long) ia64_imva(__init_end); while (addr < eaddr) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); ++totalram_pages; addr += PAGE_SIZE; @@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end) continue; page = virt_to_page(start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(start); ++totalram_pages; } @@ -640,7 +640,7 @@ mem_init (void) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile index 3e9b4eea741..ab9c48c8801 100644 --- a/arch/ia64/sn/kernel/Makefile +++ b/arch/ia64/sn/kernel/Makefile @@ -10,7 +10,8 @@ CPPFLAGS += -I$(srctree)/arch/ia64/sn/include obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \ - huberror.o io_init.o iomv.o klconflib.o sn2/ + huberror.o io_init.o iomv.o klconflib.o pio_phys.o \ + sn2/ obj-$(CONFIG_IA64_GENERIC) += machvec.o obj-$(CONFIG_SGI_TIOCX) += tiocx.o obj-$(CONFIG_IA64_SGI_SN_XP) += xp.o diff --git a/arch/ia64/sn/kernel/pio_phys.S b/arch/ia64/sn/kernel/pio_phys.S new file mode 100644 index 00000000000..3c7d48d6ecb --- /dev/null +++ b/arch/ia64/sn/kernel/pio_phys.S @@ -0,0 +1,71 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved. + * + * This file contains macros used to access MMR registers via + * uncached physical addresses. 
+ * pio_phys_read_mmr - read an MMR + * pio_phys_write_mmr - write an MMR + * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 + * Second MMR will be skipped if address is NULL + * + * Addresses passed to these routines should be uncached physical addresses + * ie., 0x80000.... + */ + + + +#include <asm/asmmacro.h> +#include <asm/page.h> + +GLOBAL_ENTRY(pio_phys_read_mmr) + .prologue + .regstk 1,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + ld8.acq r8=[r32] + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_read_mmr) + +GLOBAL_ENTRY(pio_phys_write_mmr) + .prologue + .regstk 2,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + st8.rel [r32]=r33 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_write_mmr) + +GLOBAL_ENTRY(pio_atomic_phys_write_mmrs) + .prologue + .regstk 4,0,0,0 + .body + mov r2=psr + cmp.ne p9,p0=r34,r0; + rsm psr.i | psr.dt | psr.ic + ;; + srlz.d + st8.rel [r32]=r33 +(p9) st8.rel [r34]=r35 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_atomic_phys_write_mmrs) + + diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 5b84836c217..8b6d5c84470 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 1999,2001-2006 Silicon Graphics, Inc. All rights reserved. */ #include <linux/config.h> @@ -498,6 +498,7 @@ void __init sn_setup(char **cmdline_p) * for sn. */ pm_power_off = ia64_sn_power_down; + current->thread.flags |= IA64_THREAD_MIGRATION; } /** @@ -660,7 +661,8 @@ void __init sn_cpu_init(void) SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3}; u64 *pio; pio = is_shub1() ? pio1 : pio2; - pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]); + pda->pio_write_status_addr = + (volatile unsigned long *)GLOBAL_MMR_ADDR(nasid, pio[slice]); pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0; } diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b2e1e746b47..d9d306c79f2 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -93,6 +93,27 @@ static inline unsigned long wait_piowc(void) return (ws & SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK) != 0; } +/** + * sn_migrate - SN-specific task migration actions + * @task: Task being migrated to new CPU + * + * SN2 PIO writes from separate CPUs are not guaranteed to arrive in order. + * Context switching user threads which have memory-mapped MMIO may cause + * PIOs to issue from seperate CPUs, thus the PIO writes must be drained + * from the previous CPU's Shub before execution resumes on the new CPU. 
+ */ +void sn_migrate(struct task_struct *task) +{ + pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu); + volatile unsigned long *adr = last_pda->pio_write_status_addr; + unsigned long val = last_pda->pio_write_status_val; + + /* Drain PIO writes from old CPU's Shub */ + while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK) + != val)) + cpu_relax(); +} + void sn_tlb_migrate_finish(struct mm_struct *mm) { /* flush_tlb_mm is inefficient if more than 1 users of mm */ diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c index cdf6856ce08..d0abddd9ffe 100644 --- a/arch/ia64/sn/kernel/xpc_channel.c +++ b/arch/ia64/sn/kernel/xpc_channel.c @@ -21,7 +21,6 @@ #include <linux/sched.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/mutex.h> #include <linux/completion.h> #include <asm/sn/bte.h> @@ -30,6 +29,31 @@ /* + * Guarantee that the kzalloc'd memory is cacheline aligned. + */ +static void * +xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kzalloc will give us cachline aligned memory by default */ + *base = kzalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kzalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Set up the initial values for the XPartition Communication channels. */ static void @@ -93,20 +117,19 @@ xpc_setup_infrastructure(struct xpc_partition *part) * Allocate all of the channel structures as a contiguous chunk of * memory. */ - part->channels = kmalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, + part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, GFP_KERNEL); if (part->channels == NULL) { dev_err(xpc_chan, "can't get memory for channels\n"); return xpcNoMemory; } - memset(part->channels, 0, sizeof(struct xpc_channel) * XPC_NCHANNELS); part->nchannels = XPC_NCHANNELS; /* allocate all the required GET/PUT values */ - part->local_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->local_GPs_base); if (part->local_GPs == NULL) { kfree(part->channels); @@ -115,55 +138,51 @@ xpc_setup_infrastructure(struct xpc_partition *part) "values\n"); return xpcNoMemory; } - memset(part->local_GPs, 0, XPC_GP_SIZE); - part->remote_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->remote_GPs_base); if (part->remote_GPs == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; dev_err(xpc_chan, "can't get memory for remote get/put " "values\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_GPs, 0, XPC_GP_SIZE); /* allocate all the required open and close args */ - part->local_openclose_args = xpc_kmalloc_cacheline_aligned( + part->local_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->local_openclose_args_base); if (part->local_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; + dev_err(xpc_chan, "can't get memory for local connect args\n"); 
kfree(part->remote_GPs_base); part->remote_GPs = NULL; - dev_err(xpc_chan, "can't get memory for local connect args\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->local_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); - part->remote_openclose_args = xpc_kmalloc_cacheline_aligned( + part->remote_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->remote_openclose_args_base); if (part->remote_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; + dev_err(xpc_chan, "can't get memory for remote connect args\n"); kfree(part->local_openclose_args_base); part->local_openclose_args = NULL; - dev_err(xpc_chan, "can't get memory for remote connect args\n"); + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); xpc_initialize_channels(part, partid); @@ -186,18 +205,18 @@ xpc_setup_infrastructure(struct xpc_partition *part) ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ, part->IPI_owner, (void *) (u64) partid); if (ret != 0) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; - kfree(part->local_openclose_args_base); - part->local_openclose_args = NULL; - kfree(part->remote_openclose_args_base); - part->remote_openclose_args = NULL; dev_err(xpc_chan, "can't register NOTIFY IRQ handler, " "errno=%d\n", -ret); + kfree(part->remote_openclose_args_base); + part->remote_openclose_args = NULL; + kfree(part->local_openclose_args_base); + part->local_openclose_args = NULL; + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcLackOfResources; } @@ -446,22 +465,20 @@ xpc_allocate_local_msgqueue(struct xpc_channel *ch) for (nentries = ch->local_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->local_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->local_msgqueue_base); if (ch->local_msgqueue == NULL) { continue; } - memset(ch->local_msgqueue, 0, nbytes); nbytes = nentries * sizeof(struct xpc_notify); - ch->notify_queue = kmalloc(nbytes, GFP_KERNEL); + ch->notify_queue = kzalloc(nbytes, GFP_KERNEL); if (ch->notify_queue == NULL) { kfree(ch->local_msgqueue_base); ch->local_msgqueue = NULL; continue; } - memset(ch->notify_queue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->local_nentries) { @@ -501,13 +518,12 @@ xpc_allocate_remote_msgqueue(struct xpc_channel *ch) for (nentries = ch->remote_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->remote_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->remote_msgqueue_base); if (ch->remote_msgqueue == NULL) { continue; } - memset(ch->remote_msgqueue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->remote_nentries) { diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c 
index 8cbf1643257..99b123a6421 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -52,7 +52,6 @@ #include <linux/syscalls.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/completion.h> diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 88a730e6cfd..94211429fd0 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -81,6 +81,31 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE + /* + * Guarantee that the kmalloc'd memory is cacheline aligned. + */ +static void * +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kmalloc will give us cachline aligned memory by default */ + *base = kmalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kmalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Given a nasid, get the physical address of the partition's reserved page * for that nasid. This function returns 0 on any error. */ @@ -1038,13 +1063,12 @@ xpc_discovery(void) remote_vars = (struct xpc_vars *) remote_rp; - discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words, + discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL); if (discovered_nasids == NULL) { kfree(remote_rp_base); return; } - memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words); rp = (struct xpc_rsvd_page *) xpc_rsvd_page; diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index e52831ed93e..fa073cc4b56 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -15,6 +15,124 @@ #include <asm/sn/pcidev.h> #include <asm/sn/pcibus_provider_defs.h> #include <asm/sn/tioce_provider.h> +#include <asm/sn/sn2/sn_hwperf.h> + +/* + * 1/26/2006 + * + * WAR for SGI PV 944642. For revA TIOCE, need to use the following recipe + * (taken from the above PV) before and after accessing tioce internal MMR's + * to avoid tioce lockups. + * + * The recipe as taken from the PV: + * + * if(mmr address < 0x45000) { + * if(mmr address == 0 or 0x80) + * mmr wrt or read address 0xc0 + * else if(mmr address == 0x148 or 0x200) + * mmr wrt or read address 0x28 + * else + * mmr wrt or read address 0x158 + * + * do desired mmr access (rd or wrt) + * + * if(mmr address == 0x100) + * mmr wrt or read address 0x38 + * mmr wrt or read address 0xb050 + * } else + * do desired mmr access + * + * According to hw, we can use reads instead of writes to the above addres + * + * Note this WAR can only to be used for accessing internal MMR's in the + * TIOCE Coretalk Address Range 0x0 - 0x07ff_ffff. This includes the + * "Local CE Registers and Memories" and "PCI Compatible Config Space" address + * spaces from table 2-1 of the "CE Programmer's Reference Overview" document. + * + * All registers defined in struct tioce will meet that criteria. 
+ */ + +static void inline +tioce_mmr_war_pre(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + u64 mmr_war_offset; + + if (mmr_offset == 0 || mmr_offset == 0x80) + mmr_war_offset = 0xc0; + else if (mmr_offset == 0x148 || mmr_offset == 0x200) + mmr_war_offset = 0x28; + else + mmr_war_offset = 0x158; + + readq_relaxed((void *)(mmr_base + mmr_war_offset)); + } +} + +static void inline +tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + if (mmr_offset == 0x100) + readq_relaxed((void *)(mmr_base + 0x38)); + readq_relaxed((void *)(mmr_base + 0xb050)); + } +} + +/* load mmr contents into a variable */ +#define tioce_mmr_load(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + *(varp) = readq_relaxed(mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store variable contents into mmr */ +#define tioce_mmr_store(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(*varp, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store immediate value into mmr */ +#define tioce_mmr_storei(kern, mmrp, val) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(val, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* set bits (immediate value) into mmr */ +#define tioce_mmr_seti(kern, mmrp, bits) do {\ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp |= (bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) + +/* clear bits (immediate value) into mmr */ +#define tioce_mmr_clri(kern, mmrp, bits) do { \ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp &= ~(bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) /** * Bus address ranges for the 5 flavors of TIOCE DMA @@ -62,9 +180,9 @@ #define TIOCE_ATE_M40 2 #define TIOCE_ATE_M40S 3 -#define KB(x) ((x) << 10) -#define MB(x) ((x) << 20) -#define GB(x) ((x) << 30) +#define KB(x) ((u64)(x) << 10) +#define MB(x) ((u64)(x) << 20) +#define GB(x) ((u64)(x) << 30) /** * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode @@ -151,7 +269,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, int last; int entries; int nates; - int pagesize; + u64 pagesize; u64 *ate_shadow; u64 *ate_reg; u64 addr; @@ -228,7 +346,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate = ATE_MAKE(addr, pagesize); ate_shadow[i + j] = ate; - writeq(ate, &ate_reg[i + j]); + tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate); addr += pagesize; } @@ -272,7 +390,8 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr) u64 tmp; ce_kern->ce_port[port].dirmap_shadow = ct_upper; - writeq(ct_upper, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + ct_upper); tmp = ce_mmr->ce_ure_dir_map[port]; dma_ok = 1; } else @@ -344,7 +463,8 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) if (TIOCE_D32_ADDR(bus_addr)) { if (--ce_kern->ce_port[port].dirmap_refcnt == 0) { ce_kern->ce_port[port].dirmap_shadow = 0; - writeq(0, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + 0); } } else { struct tioce_dmamap *map; @@ -365,7 +485,7 @@ 
tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) } else if (--map->refcnt == 0) { for (i = 0; i < map->ate_count; i++) { map->ate_shadow[i] = 0; - map->ate_hw[i] = 0; + tioce_mmr_storei(ce_kern, &map->ate_hw[i], 0); } list_del(&map->ce_dmamap_list); @@ -486,7 +606,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, spin_unlock_irqrestore(&ce_kern->ce_lock, flags); dma_map_done: - if (mapaddr & barrier) + if (mapaddr && barrier) mapaddr = tioce_dma_barrier(mapaddr, 1); return mapaddr; @@ -541,17 +661,61 @@ tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt) soft->ce_pcibus.bs_persist_segment, soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0); + if (ret_stuff.v0) + panic("tioce_error_intr_handler: Fatal TIOCE error"); + return IRQ_HANDLED; } /** + * tioce_reserve_m32 - reserve M32 ate's for the indicated address range + * @tioce_kernel: TIOCE context to reserve ate's for + * @base: starting bus address to reserve + * @limit: last bus address to reserve + * + * If base/limit falls within the range of bus space mapped through the + * M32 space, reserve the resources corresponding to the range. + */ +static void +tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit) +{ + int ate_index, last_ate, ps; + struct tioce *ce_mmr; + + if (!TIOCE_M32_ADDR(base)) + return; + + ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base; + ps = ce_kern->ce_ate3240_pagesize; + ate_index = ATE_PAGE(base, ps); + last_ate = ate_index + ATE_NPAGES(base, limit-base+1, ps) - 1; + + if (ate_index < 64) + ate_index = 64; + + while (ate_index <= last_ate) { + u64 ate; + + ate = ATE_MAKE(0xdeadbeef, ps); + ce_kern->ce_ate3240_shadow[ate_index] = ate; + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index], + ate); + ate_index++; + } +} + +/** * tioce_kern_init - init kernel structures related to a given TIOCE * @tioce_common: ptr to a cached tioce_common struct that originated in prom - */ static struct tioce_kernel * + */ +static struct tioce_kernel * tioce_kern_init(struct tioce_common *tioce_common) { int i; + int ps; + int dev; u32 tmp; + unsigned int seg, bus; struct tioce *tioce_mmr; struct tioce_kernel *tioce_kern; @@ -572,9 +736,10 @@ tioce_kern_init(struct tioce_common *tioce_common) * here to use pci_read_config_xxx() so use the raw_pci_ops vector. 
*/ - raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment, - tioce_common->ce_pcibus.bs_persist_busnum, - PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp); + seg = tioce_common->ce_pcibus.bs_persist_segment; + bus = tioce_common->ce_pcibus.bs_persist_busnum; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1,&tmp); tioce_kern->ce_port1_secondary = (u8) tmp; /* @@ -583,18 +748,76 @@ tioce_kern_init(struct tioce_common *tioce_common) */ tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; - __sn_clrq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_PAGESIZE_MASK); - __sn_setq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_256K_PAGESIZE); - tioce_kern->ce_ate3240_pagesize = KB(256); + tioce_mmr_clri(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_PAGESIZE_MASK); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_256K_PAGESIZE); + ps = tioce_kern->ce_ate3240_pagesize = KB(256); for (i = 0; i < TIOCE_NUM_M40_ATES; i++) { tioce_kern->ce_ate40_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate40[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate40[i], 0); } for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) { tioce_kern->ce_ate3240_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate3240[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate3240[i], 0); + } + + /* + * Reserve ATE's corresponding to reserved address ranges. These + * include: + * + * Memory space covered by each PPB mem base/limit register + * Memory space covered by each PPB prefetch base/limit register + * + * These bus ranges are for pio (downstream) traffic only, and so + * cannot be used for DMA. + */ + + for (dev = 1; dev <= 2; dev++) { + u64 base, limit; + + /* mem base/limit */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_BASE, 2, &tmp); + base = (u64)tmp << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_LIMIT, 2, &tmp); + limit = (u64)tmp << 16; + limit |= 0xfffffUL; + + if (base < limit) + tioce_reserve_m32(tioce_kern, base, limit); + + /* + * prefetch mem base/limit. The tioce ppb's have 64-bit + * decoders, so read the upper portions w/o checking the + * attributes. 
+ */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_BASE, 2, &tmp); + base = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_BASE_UPPER32, 4, &tmp); + base |= (u64)tmp << 32; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_LIMIT, 2, &tmp); + + limit = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + limit |= 0xfffffUL; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_LIMIT_UPPER32, 4, &tmp); + limit |= (u64)tmp << 32; + + if ((base < limit) && TIOCE_M32_ADDR(base)) + tioce_reserve_m32(tioce_kern, base, limit); } return tioce_kern; @@ -614,6 +837,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; u64 force_int_val; @@ -629,6 +853,29 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; + + /* + * TIOCE Rev A workaround (PV 945826), force an interrupt by writing + * the TIO_INTx register directly (1/26/2006) + */ + if (ce_common->ce_rev == TIOCE_REV_A) { + u64 int_bit_mask = (1ULL << sn_irq_info->irq_int_bit); + u64 status; + + tioce_mmr_load(ce_kern, &ce_mmr->ce_adm_int_status, &status); + if (status & int_bit_mask) { + u64 force_irq = (1 << 8) | sn_irq_info->irq_irq; + u64 ctalk = sn_irq_info->irq_xtalkaddr; + u64 nasid, offset; + + nasid = (ctalk & CTALK_NASID_MASK) >> CTALK_NASID_SHFT; + offset = (ctalk & CTALK_NODE_OFFSET); + HUB_S(TIO_IOSPACE_ADDR(nasid, offset), force_irq); + } + + return; + } /* * irq_int_bit is originally set up by prom, and holds the interrupt @@ -666,7 +913,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) default: return; } - writeq(force_int_val, &ce_mmr->ce_adm_force_int); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_force_int, force_int_val); } /** @@ -685,6 +932,7 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; int bit; u64 vector; @@ -695,14 +943,15 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; bit = sn_irq_info->irq_int_bit; - __sn_setq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT; vector |= sn_irq_info->irq_xtalkaddr; - writeq(vector, &ce_mmr->ce_adm_int_dest[bit]); - __sn_clrq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector); + tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); tioce_force_interrupt(sn_irq_info); } @@ -721,7 +970,11 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) static void * tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller) { + int my_nasid; + cnodeid_t my_cnode, mem_cnode; struct tioce_common *tioce_common; + struct tioce_kernel *tioce_kern; + struct tioce *tioce_mmr; /* * Allocate kernel bus soft and copy from prom. 
@@ -734,11 +987,23 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common)); tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET; - if (tioce_kern_init(tioce_common) == NULL) { + tioce_kern = tioce_kern_init(tioce_common); + if (tioce_kern == NULL) { kfree(tioce_common); return NULL; } + /* + * Clear out any transient errors before registering the error + * interrupt handler. + */ + + tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias, + ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL); + if (request_irq(SGI_PCIASIC_ERROR, tioce_error_intr_handler, SA_SHIRQ, "TIOCE error", (void *)tioce_common)) @@ -750,6 +1015,21 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont tioce_common->ce_pcibus.bs_persist_segment, tioce_common->ce_pcibus.bs_persist_busnum); + /* + * identify closest nasid for memory allocations + */ + + my_nasid = NASID_GET(tioce_common->ce_pcibus.bs_base); + my_cnode = nasid_to_cnodeid(my_nasid); + + if (sn_hwperf_get_nearest_node(my_cnode, &mem_cnode, NULL) < 0) { + printk(KERN_WARNING "tioce_bus_fixup: failed to find " + "closest node with MEM to TIO node %d\n", my_cnode); + mem_cnode = (cnodeid_t)-1; /* use any node */ + } + + controller->node = mem_cnode; + return tioce_common; } diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 6facf15b04f..c9e7dad860b 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -226,7 +226,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index c45beb95594..a190e39c907 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index 559942ce0e1..d6d582a5abb 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable) /* unreserve the page so it's possible to free that page */ PD_PAGE(dp)->flags &= ~(1 << PG_reserved); - set_page_count(PD_PAGE(dp), 1); + init_page_count(PD_PAGE(dp)); return; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d855fec2631..afb57eeafdc 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -276,7 +276,7 @@ void free_initmem(void) addr = (unsigned long)&__init_begin; for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { virt_to_page(addr)->flags &= ~(1 << PG_reserved); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); 
free_page(addr); totalram_pages++; } diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index eddb8d3e130..d844c755945 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 89f0b554ffb..d79503fe6e4 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -218,7 +218,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c index 958d2eb7886..8a9ef58cc39 100644 --- a/arch/mips/arc/memory.c +++ b/arch/mips/arc/memory.c @@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 81cb5a76cfb..1edaf3074ee 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void) addr = PAGE_SIZE; while (addr < end) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; } diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c index 2c8afd77a20..ee5e70c95cf 100644 --- a/arch/mips/mips-boards/generic/memory.c +++ b/arch/mips/mips-boards/generic/memory.c @@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c index 0dbd7435bb2..1ec4e75656b 100644 --- a/arch/mips/mips-boards/sim/sim_mem.c +++ b/arch/mips/mips-boards/sim/sim_mem.c @@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 0ff9a348b84..52f7d59fe61 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -54,7 +54,8 @@ unsigned 
long empty_zero_page, zero_page_mask; */ unsigned long setup_zero_pages(void) { - unsigned long order, size; + unsigned int order; + unsigned long size; struct page *page; if (cpu_has_vce) @@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void) panic("Oh boy, that early out of memory?"); page = virt_to_page(empty_zero_page); + split_page(page, order); while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) { SetPageReserved(page); - set_page_count(page, 1); page++; } @@ -244,7 +245,7 @@ void __init mem_init(void) #ifdef CONFIG_LIMITED_DMA set_page_address(page, lowmem_page_address(page)); #endif - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -291,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -314,7 +315,7 @@ void free_initmem(void) page = addr; #endif ClearPageReserved(virt_to_page(page)); - set_page_count(virt_to_page(page), 1); + init_page_count(virt_to_page(page)); free_page(page); totalram_pages++; freed += PAGE_SIZE; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index ed93a979295..e0d095daa5e 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -559,7 +559,7 @@ void __init mem_init(void) /* if (!page_is_ram(pgnr)) continue; */ /* commented out until page_is_ram works */ ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7847ca13d6c..852eda3953d 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -398,7 +398,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); num_physpages++; totalram_pages++; @@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); num_physpages++; totalram_pages++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b51bb28c054..7370f9f33e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return __pte(old); } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! 
(within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - struct slb_flush_info { struct mm_struct *mm; u16 newareas; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 7d0d75c1184..b57fb3a2b7b 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 81cfb0c2ec5..bacb71c8981 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -140,7 +140,7 @@ void free_initmem(void) for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { memset((void *)addr, 0xcc, PAGE_SIZE); ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 550517c2dd4..454cac01d8c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -108,8 +108,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 0); - free_cold_page(page); + init_page_count(page); + __free_page(page); totalram_pages++; num_physpages++; } @@ -376,7 +376,7 @@ void __init mem_init(void) struct page *page = pfn_to_page(pfn); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index b33a4443f5a..fec8e65b36e 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); set_page_links(page, ZONE_DMA, node_id, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 685fd0defe2..61465ec88bc 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); struct page *end = page + (1 << order); + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); 
SetPageReserved(page); set_pte_at(&init_mm, vaddr, pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); @@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) * Free the otherwise unused pages. */ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 134db5c0420..cb1c294fb93 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -441,7 +441,7 @@ void __init mem_init(void) struct page *page = mem_map + pfn; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index df953383724..a055894f3bd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,7 +292,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index df3a9e452cc..ee73e30263a 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) page = alloc_pages(gfp, order); if (!page) return NULL; + split_page(page, order); ret = page_address(page); *handle = virt_to_phys(ret); @@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) end = page + (1 << order); while (++page < end) { - set_page_count(page, 1); - /* Free any unused pages */ if (page >= free) { __free_page(page); diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6b7a7688c98..a3568fd5150 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. 
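The setup_zero_pages(), __dma_alloc_coherent() and consistent_alloc() hunks above (and the xtensa pte allocation hunks further on) drop the per-page set_page_count() loop in favour of split_page(page, order), which turns one order-N allocation into 2^N independently counted order-0 pages so that unused tail pages can simply be freed. A sketch of that pattern is below; the alloc_exact() helper name is hypothetical and not part of the patch.

    /*
     * Illustrative sketch only: allocate a high-order block, split it into
     * order-0 pages, and return the tail pages that are not needed.
     */
    static void *alloc_exact(size_t size, gfp_t gfp)
    {
            unsigned int order = get_order(size);
            struct page *page = alloc_pages(gfp, order);
            struct page *p, *end;

            if (!page)
                    return NULL;

            /* After split_page(), every constituent page has its own count of 1. */
            split_page(page, order);

            end = page + (1UL << order);
            for (p = page + (PAGE_ALIGN(size) >> PAGE_SHIFT); p < end; p++)
                    __free_page(p);            /* give back the unused pages */

            return page_address(page);
    }
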
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e342565f75f..77b4a838fe1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -273,7 +273,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee..3d89f2a6c78 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c index a65e8bb2c3c..1169757fb38 100644 --- a/arch/sh64/mm/init.c +++ b/arch/sh64/mm/init.c @@ -173,7 +173,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 40d426cce82..4219dd2ce3a 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void) /* Free unneeded trap tables */ ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a21f27d10e5..fbbd8a474c4 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void) /* Free 
unneeded trap tables */ if (!cpu_isset(i, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; } if (!cpu_isset(2, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; } if (!cpu_isset(3, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index c03babaa049..89866973246 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn) struct page *page = pfn_to_page(tmp); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -480,7 +480,7 @@ void free_initmem (void) p = virt_to_page(addr); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; num_physpages++; @@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; } diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index a7a24869d04..280dc7958a1 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. 
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index c2b556106fc..2ae143ba50d 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1461,7 +1461,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; @@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index fa4f915be5c..92cce96b5e2 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start, for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); } } @@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 544665e0451..0e65340eee3 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -279,7 +279,7 @@ int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem) for(i = 0; i < total_pages; i++){ p = &map[i]; - set_page_count(p, 0); + memset(p, 0, sizeof(struct page)); SetPageReserved(p); INIT_LIST_HEAD(&p->lru); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3080f84bf7b..ee5ce3d3cbc 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -477,7 +477,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale; +static unsigned int cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 3496abc8d37..c9dc7e46731 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -124,6 +124,7 @@ extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa95..40ed13d263c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +592,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + 
init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 35f1f1aab06..531ad21447b 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -45,6 +45,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, pte_t *pbase; if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -77,26 +84,12 @@ static inline void flush_map(unsigned long address) on_each_cpu(flush_kernel_map, (void *)address, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; - unsigned long address; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ -static inline void save_page(unsigned long address, struct page *fpage) +static inline void save_page(struct page *fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(address); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df->address = address; - df_list = df; - } + fpage->lru.next = (struct list_head *)deferred_pages; + deferred_pages = fpage; } /* @@ -138,8 +131,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte, pfn_pte(pfn, prot)); } else { /* - * split_large_page will take the reference for this change_page_attr - * on the split page. + * split_large_page will take the reference for this + * change_page_attr on the split page. */ struct page *split; @@ -151,23 +144,20 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte,mk_pte(split, ref_prot2)); kpte_page = split; } - get_page(kpte_page); + page_private(kpte_page)++; } else if ((kpte_flags & _PAGE_PSE) == 0) { set_pte(kpte, pfn_pte(pfn, ref_prot)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - switch (page_count(kpte_page)) { - case 1: - save_page(address, kpte_page); + if (page_private(kpte_page) == 0) { + save_page(kpte_page); revert_page(address, ref_prot); - break; - case 0: - BUG(); /* memleak and failed 2M page regeneration */ } return 0; } @@ -220,17 +210,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + struct page *dpage; down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); + dpage = xchg(&deferred_pages, NULL); up_read(&init_mm.mmap_sem); - flush_map((df && !df->next) ? df->address : 0); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); + + flush_map((dpage && !dpage->lru.next) ? 
(unsigned long)page_address(dpage) : 0); + while (dpage) { + struct page *tmp = dpage; + dpage = (struct page *)dpage->lru.next; + ClearPagePrivate(tmp); + __free_page(tmp); } } diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 5a91d6c9e66..e1be4235f36 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end) { for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page((unsigned long)start); totalram_pages++; } diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c index e5e119c820e..7d28914d11c 100644 --- a/arch/xtensa/mm/pgtable.c +++ b/arch/xtensa/mm/pgtable.c @@ -14,25 +14,21 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte, p; + pte_t *pte = NULL, *p; int color = ADDR_COLOR(address); int i; p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER); if (likely(p)) { - struct page *page; - - for (i = 0; i < COLOR_SIZE; i++, p++) { - page = virt_to_page(pte); - - set_page_count(page, 1); - ClearPageCompound(page); + split_page(virt_to_page(p), COLOR_ORDER); + for (i = 0; i < COLOR_SIZE; i++) { if (ADDR_COLOR(p) == color) pte = p; else free_page(p); + p += PTRS_PER_PTE; } clear_page(pte); } @@ -49,20 +45,20 @@ int flush; struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page, p; + struct page *page = NULL, *p; int color = ADDR_COLOR(address); p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); if (likely(p)) { - for (i = 0; i < PAGE_ORDER; i++) { - set_page_count(p, 1); - ClearPageCompound(p); + split_page(p, COLOR_ORDER); - if (PADDR_COLOR(page_address(pg)) == color) + for (i = 0; i < PAGE_ORDER; i++) { + if (PADDR_COLOR(page_address(p)) == color) page = p; else - free_page(p); + __free_page(p); + p++; } clear_highpage(page); } diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h index a9efc13cc85..8a98169b60c 100644 --- a/drivers/char/snsc.h +++ b/drivers/char/snsc.h @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved. */ /* @@ -70,6 +70,9 @@ struct sysctl_data_s { #define EV_CLASS_TEST_WARNING 0x6000ul #define EV_CLASS_PWRD_NOTIFY 0x8000ul +/* ENV class codes */ +#define ENV_PWRDN_PEND 0x4101ul + #define EV_SEVERITY_POWER_STABLE 0x0000ul #define EV_SEVERITY_POWER_LOW_WARNING 0x0100ul #define EV_SEVERITY_POWER_HIGH_WARNING 0x0200ul diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c index baaa365285f..a4fa507eed9 100644 --- a/drivers/char/snsc_event.c +++ b/drivers/char/snsc_event.c @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved. */ /* @@ -187,7 +187,8 @@ scdrv_event_severity(int code) static void scdrv_dispatch_event(char *event, int len) { - int code, esp_code, src; + static int snsc_shutting_down = 0; + int code, esp_code, src, class; char desc[CHUNKSIZE]; char *severity; @@ -199,9 +200,25 @@ scdrv_dispatch_event(char *event, int len) /* how urgent is the message? 
*/ severity = scdrv_event_severity(code); - if ((code & EV_CLASS_MASK) == EV_CLASS_PWRD_NOTIFY) { + class = (code & EV_CLASS_MASK); + + if (class == EV_CLASS_PWRD_NOTIFY || code == ENV_PWRDN_PEND) { struct task_struct *p; + if (snsc_shutting_down) + return; + + snsc_shutting_down = 1; + + /* give a message for each type of event */ + if (class == EV_CLASS_PWRD_NOTIFY) + printk(KERN_NOTICE "Power off indication received." + " Sending SIGPWR to init...\n"); + else if (code == ENV_PWRDN_PEND) + printk(KERN_CRIT "WARNING: Shutting down the system" + " due to a critical environmental condition." + " Sending SIGPWR to init...\n"); + /* give a SIGPWR signal to init proc */ /* first find init's task */ @@ -210,12 +227,11 @@ scdrv_dispatch_event(char *event, int len) if (p->pid == 1) break; } - if (p) { /* we found init's task */ - printk(KERN_EMERG "Power off indication received. Initiating power fail sequence...\n"); + if (p) { force_sig(SIGPWR, p); - } else { /* failed to find init's task - just give message(s) */ - printk(KERN_WARNING "Failed to find init proc to handle power off!\n"); - printk("%s|$(0x%x)%s\n", severity, esp_code, desc); + } else { + printk(KERN_ERR "Failed to signal init!\n"); + snsc_shutting_down = 0; /* so can try again (?) */ } read_unlock(&tasklist_lock); } else { diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c index ac2a297ce37..a80c8321087 100644 --- a/drivers/char/tb0219.c +++ b/drivers/char/tb0219.c @@ -283,7 +283,7 @@ static void tb0219_pci_irq_init(void) vr41xx_set_irq_level(TB0219_PCI_SLOT3_PIN, IRQ_LEVEL_LOW); } -static int tb0219_probe(struct platform_device *dev) +static int __devinit tb0219_probe(struct platform_device *dev) { int retval; @@ -319,7 +319,7 @@ static int tb0219_probe(struct platform_device *dev) return 0; } -static int tb0219_remove(struct platform_device *dev) +static int __devexit tb0219_remove(struct platform_device *dev) { _machine_restart = old_machine_restart; @@ -335,19 +335,26 @@ static struct platform_device *tb0219_platform_device; static struct platform_driver tb0219_device_driver = { .probe = tb0219_probe, - .remove = tb0219_remove, + .remove = __devexit_p(tb0219_remove), .driver = { .name = "TB0219", + .owner = THIS_MODULE, }, }; -static int __devinit tanbac_tb0219_init(void) +static int __init tanbac_tb0219_init(void) { int retval; - tb0219_platform_device = platform_device_register_simple("TB0219", -1, NULL, 0); - if (IS_ERR(tb0219_platform_device)) - return PTR_ERR(tb0219_platform_device); + tb0219_platform_device = platform_device_alloc("TB0219", -1); + if (!tb0219_platform_device) + return -ENOMEM; + + retval = platform_device_add(tb0219_platform_device); + if (retval < 0) { + platform_device_put(tb0219_platform_device); + return retval; + } retval = platform_driver_register(&tb0219_device_driver); if (retval < 0) @@ -356,10 +363,9 @@ static int __devinit tanbac_tb0219_init(void) return retval; } -static void __devexit tanbac_tb0219_exit(void) +static void __exit tanbac_tb0219_exit(void) { platform_driver_unregister(&tb0219_device_driver); - platform_device_unregister(tb0219_platform_device); } diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c index 2267c7b8179..05e6e814d86 100644 --- a/drivers/char/vr41xx_giu.c +++ b/drivers/char/vr41xx_giu.c @@ -613,7 +613,7 @@ static struct file_operations gpio_fops = { .release = gpio_release, }; -static int giu_probe(struct platform_device *dev) +static int __devinit giu_probe(struct platform_device *dev) { unsigned long start, size, flags = 0; unsigned int 
nr_pins = 0; @@ -697,7 +697,7 @@ static int giu_probe(struct platform_device *dev) return cascade_irq(GIUINT_IRQ, giu_get_irq); } -static int giu_remove(struct platform_device *dev) +static int __devexit giu_remove(struct platform_device *dev) { iounmap(giu_base); @@ -712,9 +712,10 @@ static struct platform_device *giu_platform_device; static struct platform_driver giu_device_driver = { .probe = giu_probe, - .remove = giu_remove, + .remove = __devexit_p(giu_remove), .driver = { .name = "GIU", + .owner = THIS_MODULE, }, }; @@ -722,9 +723,15 @@ static int __init vr41xx_giu_init(void) { int retval; - giu_platform_device = platform_device_register_simple("GIU", -1, NULL, 0); - if (IS_ERR(giu_platform_device)) - return PTR_ERR(giu_platform_device); + giu_platform_device = platform_device_alloc("GIU", -1); + if (!giu_platform_device) + return -ENOMEM; + + retval = platform_device_add(giu_platform_device); + if (retval < 0) { + platform_device_put(giu_platform_device); + return retval; + } retval = platform_driver_register(&giu_device_driver); if (retval < 0) diff --git a/drivers/char/vr41xx_rtc.c b/drivers/char/vr41xx_rtc.c index bc1b4a15212..b109d9a502d 100644 --- a/drivers/char/vr41xx_rtc.c +++ b/drivers/char/vr41xx_rtc.c @@ -558,7 +558,7 @@ static struct miscdevice rtc_miscdevice = { .fops = &rtc_fops, }; -static int rtc_probe(struct platform_device *pdev) +static int __devinit rtc_probe(struct platform_device *pdev) { unsigned int irq; int retval; @@ -631,7 +631,7 @@ static int rtc_probe(struct platform_device *pdev) return 0; } -static int rtc_remove(struct platform_device *dev) +static int __devexit rtc_remove(struct platform_device *dev) { int retval; @@ -653,13 +653,14 @@ static struct platform_device *rtc_platform_device; static struct platform_driver rtc_device_driver = { .probe = rtc_probe, - .remove = rtc_remove, + .remove = __devexit_p(rtc_remove), .driver = { .name = rtc_name, + .owner = THIS_MODULE, }, }; -static int __devinit vr41xx_rtc_init(void) +static int __init vr41xx_rtc_init(void) { int retval; @@ -684,10 +685,20 @@ static int __devinit vr41xx_rtc_init(void) break; } - rtc_platform_device = platform_device_register_simple("RTC", -1, - rtc_resource, ARRAY_SIZE(rtc_resource)); - if (IS_ERR(rtc_platform_device)) - return PTR_ERR(rtc_platform_device); + rtc_platform_device = platform_device_alloc("RTC", -1); + if (!rtc_platform_device) + return -ENOMEM; + + retval = platform_device_add_resources(rtc_platform_device, + rtc_resource, ARRAY_SIZE(rtc_resource)); + + if (retval == 0) + retval = platform_device_add(rtc_platform_device); + + if (retval < 0) { + platform_device_put(rtc_platform_device); + return retval; + } retval = platform_driver_register(&rtc_device_driver); if (retval < 0) @@ -696,10 +707,9 @@ static int __devinit vr41xx_rtc_init(void) return retval; } -static void __devexit vr41xx_rtc_exit(void) +static void __exit vr41xx_rtc_exit(void) { platform_driver_unregister(&rtc_device_driver); - platform_device_unregister(rtc_platform_device); } diff --git a/drivers/char/watchdog/mv64x60_wdt.c b/drivers/char/watchdog/mv64x60_wdt.c index 00d9ef04a36..f1b9cf89f15 100644 --- a/drivers/char/watchdog/mv64x60_wdt.c +++ b/drivers/char/watchdog/mv64x60_wdt.c @@ -228,15 +228,25 @@ static int __init mv64x60_wdt_init(void) printk(KERN_INFO "MV64x60 watchdog driver\n"); - mv64x60_wdt_dev = platform_device_register_simple(MV64x60_WDT_NAME, - -1, NULL, 0); - if (IS_ERR(mv64x60_wdt_dev)) { - ret = PTR_ERR(mv64x60_wdt_dev); + mv64x60_wdt_dev = platform_device_alloc(MV64x60_WDT_NAME, 
-1); + if (!mv64x60_wdt_dev) { + ret = -ENOMEM; + goto out; + } + + ret = platform_device_add(mv64x60_wdt_dev); + if (ret) { + platform_device_put(mv64x60_wdt_dev); goto out; } ret = platform_driver_register(&mv64x60_wdt_driver); - out: + if (ret) { + platform_device_unregister(mv64x60_wdt_dev); + goto out; + } + + out: return ret; } diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 4652512f7d1..3a4e5c5b4e1 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -530,30 +530,27 @@ static DCDBAS_DEV_ATTR_RW(host_control_action); static DCDBAS_DEV_ATTR_RW(host_control_smi_type); static DCDBAS_DEV_ATTR_RW(host_control_on_shutdown); -static struct device_attribute *dcdbas_dev_attrs[] = { - &dev_attr_smi_data_buf_size, - &dev_attr_smi_data_buf_phys_addr, - &dev_attr_smi_request, - &dev_attr_host_control_action, - &dev_attr_host_control_smi_type, - &dev_attr_host_control_on_shutdown, +static struct attribute *dcdbas_dev_attrs[] = { + &dev_attr_smi_data_buf_size.attr, + &dev_attr_smi_data_buf_phys_addr.attr, + &dev_attr_smi_request.attr, + &dev_attr_host_control_action.attr, + &dev_attr_host_control_smi_type.attr, + &dev_attr_host_control_on_shutdown.attr, NULL }; -/** - * dcdbas_init: initialize driver - */ -static int __init dcdbas_init(void) +static struct attribute_group dcdbas_attr_group = { + .attrs = dcdbas_dev_attrs, +}; + +static int __devinit dcdbas_probe(struct platform_device *dev) { - int i; + int i, error; host_control_action = HC_ACTION_NONE; host_control_smi_type = HC_SMITYPE_NONE; - dcdbas_pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); - if (IS_ERR(dcdbas_pdev)) - return PTR_ERR(dcdbas_pdev); - /* * BIOS SMI calls require buffer addresses be in 32-bit address space. * This is done by setting the DMA mask below. 
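The tb0219, vr41xx_giu, vr41xx_rtc, mv64x60_wdt and dcdbas changes all replace platform_device_register_simple() with the two-step platform_device_alloc()/platform_device_add() sequence so that each failure point can be unwound correctly. A hedged sketch of that sequence, using hypothetical example_* names (assumes <linux/platform_device.h>), is shown here: a device that has been added must be unregistered, while one that was only allocated must be put.

    /* Illustrative sketch only -- not code from the patch. */
    static struct platform_driver example_driver = {
            .driver = {
                    .name  = "example",
                    .owner = THIS_MODULE,
            },
    };

    static struct platform_device *example_pdev;

    static int __init example_init(void)
    {
            int error;

            example_pdev = platform_device_alloc("example", -1);
            if (!example_pdev)
                    return -ENOMEM;

            error = platform_device_add(example_pdev);
            if (error) {
                    platform_device_put(example_pdev);        /* never added: drop the reference */
                    return error;
            }

            error = platform_driver_register(&example_driver);
            if (error)
                    platform_device_unregister(example_pdev); /* was added: full teardown */

            return error;
    }
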
@@ -561,19 +558,79 @@ static int __init dcdbas_init(void) dcdbas_pdev->dev.coherent_dma_mask = DMA_32BIT_MASK; dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask; + error = sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group); + if (error) + return error; + + for (i = 0; dcdbas_bin_attrs[i]; i++) { + error = sysfs_create_bin_file(&dev->dev.kobj, + dcdbas_bin_attrs[i]); + if (error) { + while (--i >= 0) + sysfs_remove_bin_file(&dev->dev.kobj, + dcdbas_bin_attrs[i]); + sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group); + return error; + } + } + register_reboot_notifier(&dcdbas_reboot_nb); + dev_info(&dev->dev, "%s (version %s)\n", + DRIVER_DESCRIPTION, DRIVER_VERSION); + + return 0; +} + +static int __devexit dcdbas_remove(struct platform_device *dev) +{ + int i; + + unregister_reboot_notifier(&dcdbas_reboot_nb); for (i = 0; dcdbas_bin_attrs[i]; i++) - sysfs_create_bin_file(&dcdbas_pdev->dev.kobj, - dcdbas_bin_attrs[i]); + sysfs_remove_bin_file(&dev->dev.kobj, dcdbas_bin_attrs[i]); + sysfs_remove_group(&dev->dev.kobj, &dcdbas_attr_group); - for (i = 0; dcdbas_dev_attrs[i]; i++) - device_create_file(&dcdbas_pdev->dev, dcdbas_dev_attrs[i]); + return 0; +} - dev_info(&dcdbas_pdev->dev, "%s (version %s)\n", - DRIVER_DESCRIPTION, DRIVER_VERSION); +static struct platform_driver dcdbas_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + }, + .probe = dcdbas_probe, + .remove = __devexit_p(dcdbas_remove), +}; + +/** + * dcdbas_init: initialize driver + */ +static int __init dcdbas_init(void) +{ + int error; + + error = platform_driver_register(&dcdbas_driver); + if (error) + return error; + + dcdbas_pdev = platform_device_alloc(DRIVER_NAME, -1); + if (!dcdbas_pdev) { + error = -ENOMEM; + goto err_unregister_driver; + } + + error = platform_device_add(dcdbas_pdev); + if (error) + goto err_free_device; return 0; + + err_free_device: + platform_device_put(dcdbas_pdev); + err_unregister_driver: + platform_driver_unregister(&dcdbas_driver); + return error; } /** @@ -588,6 +645,15 @@ static void __exit dcdbas_exit(void) unregister_reboot_notifier(&dcdbas_reboot_nb); smi_data_buf_free(); platform_device_unregister(dcdbas_pdev); + platform_driver_unregister(&dcdbas_driver); + + /* + * We have to free the buffer here instead of dcdbas_remove + * because only in module exit function we can be sure that + * all sysfs attributes belonging to this module have been + * released. + */ + smi_data_buf_free(); } module_init(dcdbas_init); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 88d60202b9d..26b08ee425c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -533,30 +533,35 @@ static void __clone_and_map(struct clone_info *ci) } else { /* - * Create two copy bios to deal with io that has - * been split across a target. + * Handle a bvec that must be split between two or more targets. 
*/ struct bio_vec *bv = bio->bi_io_vec + ci->idx; + sector_t remaining = to_sector(bv->bv_len); + unsigned int offset = 0; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset, max); - __map_bio(ti, clone, tio); + do { + if (offset) { + ti = dm_table_find_target(ci->map, ci->sector); + max = max_io_len(ci->md, ci->sector, ti); - ci->sector += max; - ci->sector_count -= max; - ti = dm_table_find_target(ci->map, ci->sector); - - len = to_sector(bv->bv_len) - max; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + to_bytes(max), len); - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci->md); + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + } + + len = min(remaining, max); + + clone = split_bvec(bio, ci->sector, ci->idx, + bv->bv_offset + offset, len); + + __map_bio(ti, clone, tio); + + ci->sector += len; + ci->sector_count -= len; + offset += to_bytes(len); + } while (remaining -= len); - ci->sector += len; - ci->sector_count -= len; ci->idx++; } } diff --git a/drivers/media/dvb/bt8xx/Makefile b/drivers/media/dvb/bt8xx/Makefile index 9d197efb481..d188e4c670b 100644 --- a/drivers/media/dvb/bt8xx/Makefile +++ b/drivers/media/dvb/bt8xx/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_DVB_BT8XX) += bt878.o dvb-bt8xx.o dst.o dst_ca.o -EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video/bt8xx -Idrivers/media/dvb/frontends +EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video -Idrivers/media/dvb/frontends diff --git a/drivers/net/mv643xx_eth.h b/drivers/net/mv643xx_eth.h index 7754d1974b9..4262c1da6d4 100644 --- a/drivers/net/mv643xx_eth.h +++ b/drivers/net/mv643xx_eth.h @@ -42,13 +42,23 @@ #define MAX_DESCS_PER_SKB 1 #endif +/* + * The MV643XX HW requires 8-byte alignment. However, when I/O + * is non-cache-coherent, we need to ensure that the I/O buffers + * we use don't share cache lines with other data. 
+ */ +#if defined(CONFIG_DMA_NONCOHERENT) || defined(CONFIG_NOT_COHERENT_CACHE) +#define ETH_DMA_ALIGN L1_CACHE_BYTES +#else +#define ETH_DMA_ALIGN 8 +#endif + #define ETH_VLAN_HLEN 4 #define ETH_FCS_LEN 4 -#define ETH_DMA_ALIGN 8 /* hw requires 8-byte alignment */ -#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */ +#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */ #define ETH_WRAPPER_LEN (ETH_HW_IP_ALIGN + ETH_HLEN + \ - ETH_VLAN_HLEN + ETH_FCS_LEN) -#define ETH_RX_SKB_SIZE ((dev->mtu + ETH_WRAPPER_LEN + 7) & ~0x7) + ETH_VLAN_HLEN + ETH_FCS_LEN) +#define ETH_RX_SKB_SIZE (dev->mtu + ETH_WRAPPER_LEN + ETH_DMA_ALIGN) #define ETH_RX_QUEUES_ENABLED (1 << 0) /* use only Q0 for receive */ #define ETH_TX_QUEUES_ENABLED (1 << 0) /* use only Q0 for transmit */ diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 7e900572eaf..9595f74da93 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -22,12 +22,12 @@ *************************************************************************/ #define DRV_NAME "pcnet32" -#define DRV_VERSION "1.31c" -#define DRV_RELDATE "01.Nov.2005" +#define DRV_VERSION "1.32" +#define DRV_RELDATE "18.Mar.2006" #define PFX DRV_NAME ": " -static const char * const version = -DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; +static const char *const version = + DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; #include <linux/module.h> #include <linux/kernel.h> @@ -58,18 +58,23 @@ DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; * PCI device identifiers for "new style" Linux PCI Device Drivers */ static struct pci_device_id pcnet32_pci_tbl[] = { - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - /* - * Adapters that were sold with IBM's RS/6000 or pSeries hardware have - * the incorrect vendor id. - */ - { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0 }, - { 0, } + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + + /* + * Adapters that were sold with IBM's RS/6000 or pSeries hardware have + * the incorrect vendor id. + */ + { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE, + PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0}, + + { } /* terminate list */ }; -MODULE_DEVICE_TABLE (pci, pcnet32_pci_tbl); +MODULE_DEVICE_TABLE(pci, pcnet32_pci_tbl); static int cards_found; @@ -77,13 +82,11 @@ static int cards_found; * VLB I/O addresses */ static unsigned int pcnet32_portlist[] __initdata = - { 0x300, 0x320, 0x340, 0x360, 0 }; - - + { 0x300, 0x320, 0x340, 0x360, 0 }; static int pcnet32_debug = 0; -static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */ -static int pcnet32vlb; /* check for VLB cards ? */ +static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */ +static int pcnet32vlb; /* check for VLB cards ? 
*/ static struct net_device *pcnet32_dev; @@ -110,32 +113,34 @@ static int rx_copybreak = 200; * to internal options */ static const unsigned char options_mapping[] = { - PCNET32_PORT_ASEL, /* 0 Auto-select */ - PCNET32_PORT_AUI, /* 1 BNC/AUI */ - PCNET32_PORT_AUI, /* 2 AUI/BNC */ - PCNET32_PORT_ASEL, /* 3 not supported */ - PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */ - PCNET32_PORT_ASEL, /* 5 not supported */ - PCNET32_PORT_ASEL, /* 6 not supported */ - PCNET32_PORT_ASEL, /* 7 not supported */ - PCNET32_PORT_ASEL, /* 8 not supported */ - PCNET32_PORT_MII, /* 9 MII 10baseT */ - PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */ - PCNET32_PORT_MII, /* 11 MII (autosel) */ - PCNET32_PORT_10BT, /* 12 10BaseT */ - PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */ - PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD, /* 14 MII 100BaseTx-FD */ - PCNET32_PORT_ASEL /* 15 not supported */ + PCNET32_PORT_ASEL, /* 0 Auto-select */ + PCNET32_PORT_AUI, /* 1 BNC/AUI */ + PCNET32_PORT_AUI, /* 2 AUI/BNC */ + PCNET32_PORT_ASEL, /* 3 not supported */ + PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */ + PCNET32_PORT_ASEL, /* 5 not supported */ + PCNET32_PORT_ASEL, /* 6 not supported */ + PCNET32_PORT_ASEL, /* 7 not supported */ + PCNET32_PORT_ASEL, /* 8 not supported */ + PCNET32_PORT_MII, /* 9 MII 10baseT */ + PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */ + PCNET32_PORT_MII, /* 11 MII (autosel) */ + PCNET32_PORT_10BT, /* 12 10BaseT */ + PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */ + /* 14 MII 100BaseTx-FD */ + PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD, + PCNET32_PORT_ASEL /* 15 not supported */ }; static const char pcnet32_gstrings_test[][ETH_GSTRING_LEN] = { - "Loopback test (offline)" + "Loopback test (offline)" }; + #define PCNET32_TEST_LEN (sizeof(pcnet32_gstrings_test) / ETH_GSTRING_LEN) -#define PCNET32_NUM_REGS 168 +#define PCNET32_NUM_REGS 136 -#define MAX_UNITS 8 /* More are supported, limit only on options */ +#define MAX_UNITS 8 /* More are supported, limit only on options */ static int options[MAX_UNITS]; static int full_duplex[MAX_UNITS]; static int homepna[MAX_UNITS]; @@ -151,124 +156,6 @@ static int homepna[MAX_UNITS]; */ /* - * History: - * v0.01: Initial version - * only tested on Alpha Noname Board - * v0.02: changed IRQ handling for new interrupt scheme (dev_id) - * tested on a ASUS SP3G - * v0.10: fixed an odd problem with the 79C974 in a Compaq Deskpro XL - * looks like the 974 doesn't like stopping and restarting in a - * short period of time; now we do a reinit of the lance; the - * bug was triggered by doing ifconfig eth0 <ip> broadcast <addr> - * and hangs the machine (thanks to Klaus Liedl for debugging) - * v0.12: by suggestion from Donald Becker: Renamed driver to pcnet32, - * made it standalone (no need for lance.c) - * v0.13: added additional PCI detecting for special PCI devices (Compaq) - * v0.14: stripped down additional PCI probe (thanks to David C Niemi - * and sveneric@xs4all.nl for testing this on their Compaq boxes) - * v0.15: added 79C965 (VLB) probe - * added interrupt sharing for PCI chips - * v0.16: fixed set_multicast_list on Alpha machines - * v0.17: removed hack from dev.c; now pcnet32 uses ethif_probe in Space.c - * v0.19: changed setting of autoselect bit - * v0.20: removed additional Compaq PCI probe; there is now a working one - * in arch/i386/bios32.c - * v0.21: added endian conversion for ppc, from work by cort@cs.nmt.edu - * v0.22: added printing of status to ring dump - * 
v0.23: changed enet_statistics to net_devive_stats - * v0.90: added multicast filter - * added module support - * changed irq probe to new style - * added PCnetFast chip id - * added fix for receive stalls with Intel saturn chipsets - * added in-place rx skbs like in the tulip driver - * minor cleanups - * v0.91: added PCnetFast+ chip id - * back port to 2.0.x - * v1.00: added some stuff from Donald Becker's 2.0.34 version - * added support for byte counters in net_dev_stats - * v1.01: do ring dumps, only when debugging the driver - * increased the transmit timeout - * v1.02: fixed memory leak in pcnet32_init_ring() - * v1.10: workaround for stopped transmitter - * added port selection for modules - * detect special T1/E1 WAN card and setup port selection - * v1.11: fixed wrong checking of Tx errors - * v1.20: added check of return value kmalloc (cpeterso@cs.washington.edu) - * added save original kmalloc addr for freeing (mcr@solidum.com) - * added support for PCnetHome chip (joe@MIT.EDU) - * rewritten PCI card detection - * added dwio mode to get driver working on some PPC machines - * v1.21: added mii selection and mii ioctl - * v1.22: changed pci scanning code to make PPC people happy - * fixed switching to 32bit mode in pcnet32_open() (thanks - * to Michael Richard <mcr@solidum.com> for noticing this one) - * added sub vendor/device id matching (thanks again to - * Michael Richard <mcr@solidum.com>) - * added chip id for 79c973/975 (thanks to Zach Brown <zab@zabbo.net>) - * v1.23 fixed small bug, when manual selecting MII speed/duplex - * v1.24 Applied Thomas' patch to use TxStartPoint and thus decrease TxFIFO - * underflows. Added tx_start_pt module parameter. Increased - * TX_RING_SIZE from 16 to 32. Added #ifdef'd code to use DXSUFLO - * for FAST[+] chipsets. <kaf@fc.hp.com> - * v1.24ac Added SMP spinlocking - Alan Cox <alan@redhat.com> - * v1.25kf Added No Interrupt on successful Tx for some Tx's <kaf@fc.hp.com> - * v1.26 Converted to pci_alloc_consistent, Jamey Hicks / George France - * <jamey@crl.dec.com> - * - Fixed a few bugs, related to running the controller in 32bit mode. - * 23 Oct, 2000. Carsten Langgaard, carstenl@mips.com - * Copyright (C) 2000 MIPS Technologies, Inc. All rights reserved. - * v1.26p Fix oops on rmmod+insmod; plug i/o resource leak - Paul Gortmaker - * v1.27 improved CSR/PROM address detection, lots of cleanups, - * new pcnet32vlb module option, HP-PARISC support, - * added module parameter descriptions, - * initial ethtool support - Helge Deller <deller@gmx.de> - * v1.27a Sun Feb 10 2002 Go Taniguchi <go@turbolinux.co.jp> - * use alloc_etherdev and register_netdev - * fix pci probe not increment cards_found - * FD auto negotiate error workaround for xSeries250 - * clean up and using new mii module - * v1.27b Sep 30 2002 Kent Yoder <yoder1@us.ibm.com> - * Added timer for cable connection state changes. - * v1.28 20 Feb 2004 Don Fry <brazilnut@us.ibm.com> - * Jon Mason <jonmason@us.ibm.com>, Chinmay Albal <albal@in.ibm.com> - * Now uses ethtool_ops, netif_msg_* and generic_mii_ioctl. - * Fixes bogus 'Bus master arbitration failure', pci_[un]map_single - * length errors, and transmit hangs. Cleans up after errors in open. - * Jim Lewis <jklewis@us.ibm.com> added ethernet loopback test. - * Thomas Munck Steenholdt <tmus@tmus.dk> non-mii ioctl corrections. - * v1.29 6 Apr 2004 Jim Lewis <jklewis@us.ibm.com> added physical - * identification code (blink led's) and register dump. 
- * Don Fry added timer for 971/972 so skbufs don't remain on tx ring - * forever. - * v1.30 18 May 2004 Don Fry removed timer and Last Transmit Interrupt - * (ltint) as they added complexity and didn't give good throughput. - * v1.30a 22 May 2004 Don Fry limit frames received during interrupt. - * v1.30b 24 May 2004 Don Fry fix bogus tx carrier errors with 79c973, - * assisted by Bruce Penrod <bmpenrod@endruntechnologies.com>. - * v1.30c 25 May 2004 Don Fry added netif_wake_queue after pcnet32_restart. - * v1.30d 01 Jun 2004 Don Fry discard oversize rx packets. - * v1.30e 11 Jun 2004 Don Fry recover after fifo error and rx hang. - * v1.30f 16 Jun 2004 Don Fry cleanup IRQ to allow 0 and 1 for PCI, - * expanding on suggestions from Ralf Baechle <ralf@linux-mips.org>, - * and Brian Murphy <brian@murphy.dk>. - * v1.30g 22 Jun 2004 Patrick Simmons <psimmons@flash.net> added option - * homepna for selecting HomePNA mode for PCNet/Home 79C978. - * v1.30h 24 Jun 2004 Don Fry correctly select auto, speed, duplex in bcr32. - * v1.30i 28 Jun 2004 Don Fry change to use module_param. - * v1.30j 29 Apr 2005 Don Fry fix skb/map leak with loopback test. - * v1.31 02 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> added set_ringparam(). - * v1.31a 12 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> set min ring size to 4 - * to allow loopback test to work unchanged. - * v1.31b 06 Oct 2005 Don Fry changed alloc_ring to show name of device - * if allocation fails - * v1.31c 01 Nov 2005 Don Fry Allied Telesyn 2700/2701 FX are 100Mbit only. - * Force 100Mbit FD if Auto (ASEL) is selected. - * See Bugzilla 2669 and 4551. - */ - - -/* * Set the number of Tx and Rx buffers, using Log_2(# buffers). * Reasonable default values are 4 Tx buffers, and 16 Rx buffers. * That translates to 2 (4 == 2^^2) and 4 (16 == 2^^4). @@ -303,42 +190,42 @@ static int homepna[MAX_UNITS]; /* The PCNET32 Rx and Tx ring descriptors. */ struct pcnet32_rx_head { - u32 base; - s16 buf_length; - s16 status; - u32 msg_length; - u32 reserved; + u32 base; + s16 buf_length; + s16 status; + u32 msg_length; + u32 reserved; }; struct pcnet32_tx_head { - u32 base; - s16 length; - s16 status; - u32 misc; - u32 reserved; + u32 base; + s16 length; + s16 status; + u32 misc; + u32 reserved; }; /* The PCNET32 32-Bit initialization block, described in databook. */ struct pcnet32_init_block { - u16 mode; - u16 tlen_rlen; - u8 phys_addr[6]; - u16 reserved; - u32 filter[2]; - /* Receive and transmit ring base, along with extra bits. */ - u32 rx_ring; - u32 tx_ring; + u16 mode; + u16 tlen_rlen; + u8 phys_addr[6]; + u16 reserved; + u32 filter[2]; + /* Receive and transmit ring base, along with extra bits. */ + u32 rx_ring; + u32 tx_ring; }; /* PCnet32 access functions */ struct pcnet32_access { - u16 (*read_csr)(unsigned long, int); - void (*write_csr)(unsigned long, int, u16); - u16 (*read_bcr)(unsigned long, int); - void (*write_bcr)(unsigned long, int, u16); - u16 (*read_rap)(unsigned long); - void (*write_rap)(unsigned long, u16); - void (*reset)(unsigned long); + u16 (*read_csr) (unsigned long, int); + void (*write_csr) (unsigned long, int, u16); + u16 (*read_bcr) (unsigned long, int); + void (*write_bcr) (unsigned long, int, u16); + u16 (*read_rap) (unsigned long); + void (*write_rap) (unsigned long, u16); + void (*reset) (unsigned long); }; /* @@ -346,760 +233,794 @@ struct pcnet32_access { * so the structure should be allocated using pci_alloc_consistent(). 
*/ struct pcnet32_private { - struct pcnet32_init_block init_block; - /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ - struct pcnet32_rx_head *rx_ring; - struct pcnet32_tx_head *tx_ring; - dma_addr_t dma_addr; /* DMA address of beginning of this - object, returned by - pci_alloc_consistent */ - struct pci_dev *pci_dev; /* Pointer to the associated pci device - structure */ - const char *name; - /* The saved address of a sent-in-place packet/buffer, for skfree(). */ - struct sk_buff **tx_skbuff; - struct sk_buff **rx_skbuff; - dma_addr_t *tx_dma_addr; - dma_addr_t *rx_dma_addr; - struct pcnet32_access a; - spinlock_t lock; /* Guard lock */ - unsigned int cur_rx, cur_tx; /* The next free ring entry */ - unsigned int rx_ring_size; /* current rx ring size */ - unsigned int tx_ring_size; /* current tx ring size */ - unsigned int rx_mod_mask; /* rx ring modular mask */ - unsigned int tx_mod_mask; /* tx ring modular mask */ - unsigned short rx_len_bits; - unsigned short tx_len_bits; - dma_addr_t rx_ring_dma_addr; - dma_addr_t tx_ring_dma_addr; - unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ - struct net_device_stats stats; - char tx_full; - int options; - unsigned int shared_irq:1, /* shared irq possible */ - dxsuflo:1, /* disable transmit stop on uflo */ - mii:1; /* mii port available */ - struct net_device *next; - struct mii_if_info mii_if; - struct timer_list watchdog_timer; - struct timer_list blink_timer; - u32 msg_enable; /* debug message level */ + struct pcnet32_init_block init_block; + /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ + struct pcnet32_rx_head *rx_ring; + struct pcnet32_tx_head *tx_ring; + dma_addr_t dma_addr;/* DMA address of beginning of this + object, returned by pci_alloc_consistent */ + struct pci_dev *pci_dev; + const char *name; + /* The saved address of a sent-in-place packet/buffer, for skfree(). */ + struct sk_buff **tx_skbuff; + struct sk_buff **rx_skbuff; + dma_addr_t *tx_dma_addr; + dma_addr_t *rx_dma_addr; + struct pcnet32_access a; + spinlock_t lock; /* Guard lock */ + unsigned int cur_rx, cur_tx; /* The next free ring entry */ + unsigned int rx_ring_size; /* current rx ring size */ + unsigned int tx_ring_size; /* current tx ring size */ + unsigned int rx_mod_mask; /* rx ring modular mask */ + unsigned int tx_mod_mask; /* tx ring modular mask */ + unsigned short rx_len_bits; + unsigned short tx_len_bits; + dma_addr_t rx_ring_dma_addr; + dma_addr_t tx_ring_dma_addr; + unsigned int dirty_rx, /* ring entries to be freed. 
*/ + dirty_tx; + + struct net_device_stats stats; + char tx_full; + char phycount; /* number of phys found */ + int options; + unsigned int shared_irq:1, /* shared irq possible */ + dxsuflo:1, /* disable transmit stop on uflo */ + mii:1; /* mii port available */ + struct net_device *next; + struct mii_if_info mii_if; + struct timer_list watchdog_timer; + struct timer_list blink_timer; + u32 msg_enable; /* debug message level */ + + /* each bit indicates an available PHY */ + u32 phymask; }; static void pcnet32_probe_vlbus(void); -static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); -static int pcnet32_probe1(unsigned long, int, struct pci_dev *); -static int pcnet32_open(struct net_device *); -static int pcnet32_init_ring(struct net_device *); -static int pcnet32_start_xmit(struct sk_buff *, struct net_device *); -static int pcnet32_rx(struct net_device *); -static void pcnet32_tx_timeout (struct net_device *dev); +static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); +static int pcnet32_probe1(unsigned long, int, struct pci_dev *); +static int pcnet32_open(struct net_device *); +static int pcnet32_init_ring(struct net_device *); +static int pcnet32_start_xmit(struct sk_buff *, struct net_device *); +static int pcnet32_rx(struct net_device *); +static void pcnet32_tx_timeout(struct net_device *dev); static irqreturn_t pcnet32_interrupt(int, void *, struct pt_regs *); -static int pcnet32_close(struct net_device *); +static int pcnet32_close(struct net_device *); static struct net_device_stats *pcnet32_get_stats(struct net_device *); static void pcnet32_load_multicast(struct net_device *dev); static void pcnet32_set_multicast_list(struct net_device *); -static int pcnet32_ioctl(struct net_device *, struct ifreq *, int); +static int pcnet32_ioctl(struct net_device *, struct ifreq *, int); static void pcnet32_watchdog(struct net_device *); static int mdio_read(struct net_device *dev, int phy_id, int reg_num); -static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val); +static void mdio_write(struct net_device *dev, int phy_id, int reg_num, + int val); static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits); static void pcnet32_ethtool_test(struct net_device *dev, - struct ethtool_test *eth_test, u64 *data); -static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1); + struct ethtool_test *eth_test, u64 * data); +static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1); static int pcnet32_phys_id(struct net_device *dev, u32 data); static void pcnet32_led_blink_callback(struct net_device *dev); static int pcnet32_get_regs_len(struct net_device *dev); static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, - void *ptr); + void *ptr); static void pcnet32_purge_tx_ring(struct net_device *dev); static int pcnet32_alloc_ring(struct net_device *dev, char *name); static void pcnet32_free_ring(struct net_device *dev); - +static void pcnet32_check_media(struct net_device *dev, int verbose); enum pci_flags_bit { - PCI_USES_IO=1, PCI_USES_MEM=2, PCI_USES_MASTER=4, - PCI_ADDR0=0x10<<0, PCI_ADDR1=0x10<<1, PCI_ADDR2=0x10<<2, PCI_ADDR3=0x10<<3, + PCI_USES_IO = 1, PCI_USES_MEM = 2, PCI_USES_MASTER = 4, + PCI_ADDR0 = 0x10 << 0, PCI_ADDR1 = 0x10 << 1, PCI_ADDR2 = + 0x10 << 2, PCI_ADDR3 = 0x10 << 3, }; - -static u16 pcnet32_wio_read_csr (unsigned long addr, int index) +static u16 pcnet32_wio_read_csr(unsigned long addr, int index) { - outw (index, 
addr+PCNET32_WIO_RAP); - return inw (addr+PCNET32_WIO_RDP); + outw(index, addr + PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_RDP); } -static void pcnet32_wio_write_csr (unsigned long addr, int index, u16 val) +static void pcnet32_wio_write_csr(unsigned long addr, int index, u16 val) { - outw (index, addr+PCNET32_WIO_RAP); - outw (val, addr+PCNET32_WIO_RDP); + outw(index, addr + PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_RDP); } -static u16 pcnet32_wio_read_bcr (unsigned long addr, int index) +static u16 pcnet32_wio_read_bcr(unsigned long addr, int index) { - outw (index, addr+PCNET32_WIO_RAP); - return inw (addr+PCNET32_WIO_BDP); + outw(index, addr + PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_BDP); } -static void pcnet32_wio_write_bcr (unsigned long addr, int index, u16 val) +static void pcnet32_wio_write_bcr(unsigned long addr, int index, u16 val) { - outw (index, addr+PCNET32_WIO_RAP); - outw (val, addr+PCNET32_WIO_BDP); + outw(index, addr + PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_BDP); } -static u16 pcnet32_wio_read_rap (unsigned long addr) +static u16 pcnet32_wio_read_rap(unsigned long addr) { - return inw (addr+PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_RAP); } -static void pcnet32_wio_write_rap (unsigned long addr, u16 val) +static void pcnet32_wio_write_rap(unsigned long addr, u16 val) { - outw (val, addr+PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_RAP); } -static void pcnet32_wio_reset (unsigned long addr) +static void pcnet32_wio_reset(unsigned long addr) { - inw (addr+PCNET32_WIO_RESET); + inw(addr + PCNET32_WIO_RESET); } -static int pcnet32_wio_check (unsigned long addr) +static int pcnet32_wio_check(unsigned long addr) { - outw (88, addr+PCNET32_WIO_RAP); - return (inw (addr+PCNET32_WIO_RAP) == 88); + outw(88, addr + PCNET32_WIO_RAP); + return (inw(addr + PCNET32_WIO_RAP) == 88); } static struct pcnet32_access pcnet32_wio = { - .read_csr = pcnet32_wio_read_csr, - .write_csr = pcnet32_wio_write_csr, - .read_bcr = pcnet32_wio_read_bcr, - .write_bcr = pcnet32_wio_write_bcr, - .read_rap = pcnet32_wio_read_rap, - .write_rap = pcnet32_wio_write_rap, - .reset = pcnet32_wio_reset + .read_csr = pcnet32_wio_read_csr, + .write_csr = pcnet32_wio_write_csr, + .read_bcr = pcnet32_wio_read_bcr, + .write_bcr = pcnet32_wio_write_bcr, + .read_rap = pcnet32_wio_read_rap, + .write_rap = pcnet32_wio_write_rap, + .reset = pcnet32_wio_reset }; -static u16 pcnet32_dwio_read_csr (unsigned long addr, int index) +static u16 pcnet32_dwio_read_csr(unsigned long addr, int index) { - outl (index, addr+PCNET32_DWIO_RAP); - return (inl (addr+PCNET32_DWIO_RDP) & 0xffff); + outl(index, addr + PCNET32_DWIO_RAP); + return (inl(addr + PCNET32_DWIO_RDP) & 0xffff); } -static void pcnet32_dwio_write_csr (unsigned long addr, int index, u16 val) +static void pcnet32_dwio_write_csr(unsigned long addr, int index, u16 val) { - outl (index, addr+PCNET32_DWIO_RAP); - outl (val, addr+PCNET32_DWIO_RDP); + outl(index, addr + PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_RDP); } -static u16 pcnet32_dwio_read_bcr (unsigned long addr, int index) +static u16 pcnet32_dwio_read_bcr(unsigned long addr, int index) { - outl (index, addr+PCNET32_DWIO_RAP); - return (inl (addr+PCNET32_DWIO_BDP) & 0xffff); + outl(index, addr + PCNET32_DWIO_RAP); + return (inl(addr + PCNET32_DWIO_BDP) & 0xffff); } -static void pcnet32_dwio_write_bcr (unsigned long addr, int index, u16 val) +static void pcnet32_dwio_write_bcr(unsigned long addr, int index, u16 val) { - outl (index, addr+PCNET32_DWIO_RAP); - outl 
(val, addr+PCNET32_DWIO_BDP); + outl(index, addr + PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_BDP); } -static u16 pcnet32_dwio_read_rap (unsigned long addr) +static u16 pcnet32_dwio_read_rap(unsigned long addr) { - return (inl (addr+PCNET32_DWIO_RAP) & 0xffff); + return (inl(addr + PCNET32_DWIO_RAP) & 0xffff); } -static void pcnet32_dwio_write_rap (unsigned long addr, u16 val) +static void pcnet32_dwio_write_rap(unsigned long addr, u16 val) { - outl (val, addr+PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_RAP); } -static void pcnet32_dwio_reset (unsigned long addr) +static void pcnet32_dwio_reset(unsigned long addr) { - inl (addr+PCNET32_DWIO_RESET); + inl(addr + PCNET32_DWIO_RESET); } -static int pcnet32_dwio_check (unsigned long addr) +static int pcnet32_dwio_check(unsigned long addr) { - outl (88, addr+PCNET32_DWIO_RAP); - return ((inl (addr+PCNET32_DWIO_RAP) & 0xffff) == 88); + outl(88, addr + PCNET32_DWIO_RAP); + return ((inl(addr + PCNET32_DWIO_RAP) & 0xffff) == 88); } static struct pcnet32_access pcnet32_dwio = { - .read_csr = pcnet32_dwio_read_csr, - .write_csr = pcnet32_dwio_write_csr, - .read_bcr = pcnet32_dwio_read_bcr, - .write_bcr = pcnet32_dwio_write_bcr, - .read_rap = pcnet32_dwio_read_rap, - .write_rap = pcnet32_dwio_write_rap, - .reset = pcnet32_dwio_reset + .read_csr = pcnet32_dwio_read_csr, + .write_csr = pcnet32_dwio_write_csr, + .read_bcr = pcnet32_dwio_read_bcr, + .write_bcr = pcnet32_dwio_write_bcr, + .read_rap = pcnet32_dwio_read_rap, + .write_rap = pcnet32_dwio_write_rap, + .reset = pcnet32_dwio_reset }; #ifdef CONFIG_NET_POLL_CONTROLLER static void pcnet32_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); - pcnet32_interrupt(0, dev, NULL); - enable_irq(dev->irq); + disable_irq(dev->irq); + pcnet32_interrupt(0, dev, NULL); + enable_irq(dev->irq); } #endif - static int pcnet32_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; - - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - mii_ethtool_gset(&lp->mii_if, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - r = 0; - } - return r; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; + + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + mii_ethtool_gset(&lp->mii_if, cmd); + spin_unlock_irqrestore(&lp->lock, flags); + r = 0; + } + return r; } static int pcnet32_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - r = mii_ethtool_sset(&lp->mii_if, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - } - return r; + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + r = mii_ethtool_sset(&lp->mii_if, cmd); + spin_unlock_irqrestore(&lp->lock, flags); + } + return r; } -static void pcnet32_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +static void pcnet32_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) { - struct pcnet32_private *lp = dev->priv; - - strcpy (info->driver, DRV_NAME); - strcpy (info->version, DRV_VERSION); - if (lp->pci_dev) - strcpy (info->bus_info, pci_name(lp->pci_dev)); - else - sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr); + struct pcnet32_private *lp = dev->priv; + + strcpy(info->driver, DRV_NAME); + strcpy(info->version, DRV_VERSION); 
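For orientation, a minimal sketch of the ring-size rounding that pcnet32_set_ringparam (further below) applies to the ring parameters requested through ethtool; the function name pcnet32_round_ring and the log_max argument are illustrative stand-ins for the PCNET32_LOG_MAX_TX_BUFFERS / PCNET32_LOG_MAX_RX_BUFFERS limits used by the driver:

/* Round a requested ring size up to a power of two (minimum 4) so that
 * ring indices can be wrapped with a simple mask (size - 1).  The caller
 * is assumed to have already clamped 'requested' to the ring maximum,
 * as pcnet32_set_ringparam does with TX/RX_MAX_RING_SIZE. */
static unsigned int pcnet32_round_ring(unsigned int requested, int log_max)
{
	int i;

	for (i = 2; i <= log_max; i++)
		if (requested <= (1U << i))
			break;
	return 1U << i;		/* e.g. a request of 100 becomes 128 */
}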
+ if (lp->pci_dev) + strcpy(info->bus_info, pci_name(lp->pci_dev)); + else + sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr); } static u32 pcnet32_get_link(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r; - - spin_lock_irqsave(&lp->lock, flags); - if (lp->mii) { - r = mii_link_ok(&lp->mii_if); - } else { - ulong ioaddr = dev->base_addr; /* card base I/O address */ - r = (lp->a.read_bcr(ioaddr, 4) != 0xc0); - } - spin_unlock_irqrestore(&lp->lock, flags); + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r; - return r; + spin_lock_irqsave(&lp->lock, flags); + if (lp->mii) { + r = mii_link_ok(&lp->mii_if); + } else { + ulong ioaddr = dev->base_addr; /* card base I/O address */ + r = (lp->a.read_bcr(ioaddr, 4) != 0xc0); + } + spin_unlock_irqrestore(&lp->lock, flags); + + return r; } static u32 pcnet32_get_msglevel(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - return lp->msg_enable; + struct pcnet32_private *lp = dev->priv; + return lp->msg_enable; } static void pcnet32_set_msglevel(struct net_device *dev, u32 value) { - struct pcnet32_private *lp = dev->priv; - lp->msg_enable = value; + struct pcnet32_private *lp = dev->priv; + lp->msg_enable = value; } static int pcnet32_nway_reset(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - r = mii_nway_restart(&lp->mii_if); - spin_unlock_irqrestore(&lp->lock, flags); - } - return r; + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + r = mii_nway_restart(&lp->mii_if); + spin_unlock_irqrestore(&lp->lock, flags); + } + return r; } -static void pcnet32_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static void pcnet32_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - ering->tx_max_pending = TX_MAX_RING_SIZE - 1; - ering->tx_pending = lp->tx_ring_size - 1; - ering->rx_max_pending = RX_MAX_RING_SIZE - 1; - ering->rx_pending = lp->rx_ring_size - 1; + ering->tx_max_pending = TX_MAX_RING_SIZE - 1; + ering->tx_pending = lp->tx_ring_size - 1; + ering->rx_max_pending = RX_MAX_RING_SIZE - 1; + ering->rx_pending = lp->rx_ring_size - 1; } -static int pcnet32_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static int pcnet32_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int i; - - if (ering->rx_mini_pending || ering->rx_jumbo_pending) - return -EINVAL; - - if (netif_running(dev)) - pcnet32_close(dev); - - spin_lock_irqsave(&lp->lock, flags); - pcnet32_free_ring(dev); - lp->tx_ring_size = min(ering->tx_pending, (unsigned int) TX_MAX_RING_SIZE); - lp->rx_ring_size = min(ering->rx_pending, (unsigned int) RX_MAX_RING_SIZE); - - /* set the minimum ring size to 4, to allow the loopback test to work - * unchanged. 
- */ - for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) { - if (lp->tx_ring_size <= (1 << i)) - break; - } - lp->tx_ring_size = (1 << i); - lp->tx_mod_mask = lp->tx_ring_size - 1; - lp->tx_len_bits = (i << 12); - - for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) { - if (lp->rx_ring_size <= (1 << i)) - break; - } - lp->rx_ring_size = (1 << i); - lp->rx_mod_mask = lp->rx_ring_size - 1; - lp->rx_len_bits = (i << 4); - - if (pcnet32_alloc_ring(dev, dev->name)) { + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int i; + + if (ering->rx_mini_pending || ering->rx_jumbo_pending) + return -EINVAL; + + if (netif_running(dev)) + pcnet32_close(dev); + + spin_lock_irqsave(&lp->lock, flags); pcnet32_free_ring(dev); - spin_unlock_irqrestore(&lp->lock, flags); - return -ENOMEM; - } + lp->tx_ring_size = + min(ering->tx_pending, (unsigned int)TX_MAX_RING_SIZE); + lp->rx_ring_size = + min(ering->rx_pending, (unsigned int)RX_MAX_RING_SIZE); + + /* set the minimum ring size to 4, to allow the loopback test to work + * unchanged. + */ + for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) { + if (lp->tx_ring_size <= (1 << i)) + break; + } + lp->tx_ring_size = (1 << i); + lp->tx_mod_mask = lp->tx_ring_size - 1; + lp->tx_len_bits = (i << 12); - spin_unlock_irqrestore(&lp->lock, flags); + for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) { + if (lp->rx_ring_size <= (1 << i)) + break; + } + lp->rx_ring_size = (1 << i); + lp->rx_mod_mask = lp->rx_ring_size - 1; + lp->rx_len_bits = (i << 4); + + if (pcnet32_alloc_ring(dev, dev->name)) { + pcnet32_free_ring(dev); + spin_unlock_irqrestore(&lp->lock, flags); + return -ENOMEM; + } - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_INFO PFX "%s: Ring Param Settings: RX: %d, TX: %d\n", - dev->name, lp->rx_ring_size, lp->tx_ring_size); + spin_unlock_irqrestore(&lp->lock, flags); - if (netif_running(dev)) - pcnet32_open(dev); + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_INFO PFX + "%s: Ring Param Settings: RX: %d, TX: %d\n", dev->name, + lp->rx_ring_size, lp->tx_ring_size); - return 0; + if (netif_running(dev)) + pcnet32_open(dev); + + return 0; } -static void pcnet32_get_strings(struct net_device *dev, u32 stringset, u8 *data) +static void pcnet32_get_strings(struct net_device *dev, u32 stringset, + u8 * data) { - memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test)); + memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test)); } static int pcnet32_self_test_count(struct net_device *dev) { - return PCNET32_TEST_LEN; + return PCNET32_TEST_LEN; } static void pcnet32_ethtool_test(struct net_device *dev, - struct ethtool_test *test, u64 *data) + struct ethtool_test *test, u64 * data) { - struct pcnet32_private *lp = dev->priv; - int rc; - - if (test->flags == ETH_TEST_FL_OFFLINE) { - rc = pcnet32_loopback_test(dev, data); - if (rc) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Loopback test failed.\n", dev->name); - test->flags |= ETH_TEST_FL_FAILED; + struct pcnet32_private *lp = dev->priv; + int rc; + + if (test->flags == ETH_TEST_FL_OFFLINE) { + rc = pcnet32_loopback_test(dev, data); + if (rc) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG "%s: Loopback test failed.\n", + dev->name); + test->flags |= ETH_TEST_FL_FAILED; + } else if (netif_msg_hw(lp)) + printk(KERN_DEBUG "%s: Loopback test passed.\n", + dev->name); } else if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Loopback test passed.\n", dev->name); - } else if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: No tests to run (specify 'Offline' on ethtool).", dev->name); -} /* 
end pcnet32_ethtool_test */ + printk(KERN_DEBUG + "%s: No tests to run (specify 'Offline' on ethtool).", + dev->name); +} /* end pcnet32_ethtool_test */ -static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1) +static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; /* access to registers */ - ulong ioaddr = dev->base_addr; /* card base I/O address */ - struct sk_buff *skb; /* sk buff */ - int x, i; /* counters */ - int numbuffs = 4; /* number of TX/RX buffers and descs */ - u16 status = 0x8300; /* TX ring status */ - u16 teststatus; /* test of ring status */ - int rc; /* return code */ - int size; /* size of packets */ - unsigned char *packet; /* source packet data */ - static const int data_len = 60; /* length of source packets */ - unsigned long flags; - unsigned long ticks; - - *data1 = 1; /* status of test, default to fail */ - rc = 1; /* default to fail */ - - if (netif_running(dev)) - pcnet32_close(dev); - - spin_lock_irqsave(&lp->lock, flags); - - /* Reset the PCNET32 */ - lp->a.reset (ioaddr); - - /* switch pcnet32 to 32bit mode */ - lp->a.write_bcr (ioaddr, 20, 2); - - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - lp->init_block.filter[0] = 0; - lp->init_block.filter[1] = 0; - - /* purge & init rings but don't actually restart */ - pcnet32_restart(dev, 0x0000); - - lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ - - /* Initialize Transmit buffers. */ - size = data_len + 15; - for (x=0; x<numbuffs; x++) { - if (!(skb = dev_alloc_skb(size))) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Cannot allocate skb at line: %d!\n", - dev->name, __LINE__); - goto clean_up; - } else { - packet = skb->data; - skb_put(skb, size); /* create space for data */ - lp->tx_skbuff[x] = skb; - lp->tx_ring[x].length = le16_to_cpu(-skb->len); - lp->tx_ring[x].misc = 0; - - /* put DA and SA into the skb */ - for (i=0; i<6; i++) - *packet++ = dev->dev_addr[i]; - for (i=0; i<6; i++) - *packet++ = dev->dev_addr[i]; - /* type */ - *packet++ = 0x08; - *packet++ = 0x06; - /* packet number */ - *packet++ = x; - /* fill packet with data */ - for (i=0; i<data_len; i++) - *packet++ = i; - - lp->tx_dma_addr[x] = pci_map_single(lp->pci_dev, skb->data, - skb->len, PCI_DMA_TODEVICE); - lp->tx_ring[x].base = (u32)le32_to_cpu(lp->tx_dma_addr[x]); - wmb(); /* Make sure owner changes after all others are visible */ - lp->tx_ring[x].status = le16_to_cpu(status); - } - } - - x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */ - x = x | 0x0002; - a->write_bcr(ioaddr, 32, x); - - lp->a.write_csr (ioaddr, 15, 0x0044); /* set int loopback in CSR15 */ - - teststatus = le16_to_cpu(0x8000); - lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */ - - /* Check status of descriptors */ - for (x=0; x<numbuffs; x++) { - ticks = 0; - rmb(); - while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) { - spin_unlock_irqrestore(&lp->lock, flags); - mdelay(1); - spin_lock_irqsave(&lp->lock, flags); - rmb(); - ticks++; - } - if (ticks == 200) { - if (netif_msg_hw(lp)) - printk("%s: Desc %d failed to reset!\n",dev->name,x); - break; - } - } - - lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ - wmb(); - if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) { - printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name); - - for (x=0; x<numbuffs; x++) { - printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x); - skb = lp->rx_skbuff[x]; - for (i=0; i<size; i++) { - 
printk("%02x ", *(skb->data+i)); - } - printk("\n"); - } - } - - x = 0; - rc = 0; - while (x<numbuffs && !rc) { - skb = lp->rx_skbuff[x]; - packet = lp->tx_skbuff[x]->data; - for (i=0; i<size; i++) { - if (*(skb->data+i) != packet[i]) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Error in compare! %2x - %02x %02x\n", - dev->name, i, *(skb->data+i), packet[i]); - rc = 1; - break; - } + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; /* access to registers */ + ulong ioaddr = dev->base_addr; /* card base I/O address */ + struct sk_buff *skb; /* sk buff */ + int x, i; /* counters */ + int numbuffs = 4; /* number of TX/RX buffers and descs */ + u16 status = 0x8300; /* TX ring status */ + u16 teststatus; /* test of ring status */ + int rc; /* return code */ + int size; /* size of packets */ + unsigned char *packet; /* source packet data */ + static const int data_len = 60; /* length of source packets */ + unsigned long flags; + unsigned long ticks; + + *data1 = 1; /* status of test, default to fail */ + rc = 1; /* default to fail */ + + if (netif_running(dev)) + pcnet32_close(dev); + + spin_lock_irqsave(&lp->lock, flags); + + /* Reset the PCNET32 */ + lp->a.reset(ioaddr); + + /* switch pcnet32 to 32bit mode */ + lp->a.write_bcr(ioaddr, 20, 2); + + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + lp->init_block.filter[0] = 0; + lp->init_block.filter[1] = 0; + + /* purge & init rings but don't actually restart */ + pcnet32_restart(dev, 0x0000); + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ + + /* Initialize Transmit buffers. */ + size = data_len + 15; + for (x = 0; x < numbuffs; x++) { + if (!(skb = dev_alloc_skb(size))) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Cannot allocate skb at line: %d!\n", + dev->name, __LINE__); + goto clean_up; + } else { + packet = skb->data; + skb_put(skb, size); /* create space for data */ + lp->tx_skbuff[x] = skb; + lp->tx_ring[x].length = le16_to_cpu(-skb->len); + lp->tx_ring[x].misc = 0; + + /* put DA and SA into the skb */ + for (i = 0; i < 6; i++) + *packet++ = dev->dev_addr[i]; + for (i = 0; i < 6; i++) + *packet++ = dev->dev_addr[i]; + /* type */ + *packet++ = 0x08; + *packet++ = 0x06; + /* packet number */ + *packet++ = x; + /* fill packet with data */ + for (i = 0; i < data_len; i++) + *packet++ = i; + + lp->tx_dma_addr[x] = + pci_map_single(lp->pci_dev, skb->data, skb->len, + PCI_DMA_TODEVICE); + lp->tx_ring[x].base = + (u32) le32_to_cpu(lp->tx_dma_addr[x]); + wmb(); /* Make sure owner changes after all others are visible */ + lp->tx_ring[x].status = le16_to_cpu(status); + } + } + + x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */ + x = x | 0x0002; + a->write_bcr(ioaddr, 32, x); + + lp->a.write_csr(ioaddr, 15, 0x0044); /* set int loopback in CSR15 */ + + teststatus = le16_to_cpu(0x8000); + lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */ + + /* Check status of descriptors */ + for (x = 0; x < numbuffs; x++) { + ticks = 0; + rmb(); + while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) { + spin_unlock_irqrestore(&lp->lock, flags); + mdelay(1); + spin_lock_irqsave(&lp->lock, flags); + rmb(); + ticks++; + } + if (ticks == 200) { + if (netif_msg_hw(lp)) + printk("%s: Desc %d failed to reset!\n", + dev->name, x); + break; + } + } + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ + wmb(); + if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) { + printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name); + + for (x = 0; x < numbuffs; 
x++) { + printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x); + skb = lp->rx_skbuff[x]; + for (i = 0; i < size; i++) { + printk("%02x ", *(skb->data + i)); + } + printk("\n"); + } + } + + x = 0; + rc = 0; + while (x < numbuffs && !rc) { + skb = lp->rx_skbuff[x]; + packet = lp->tx_skbuff[x]->data; + for (i = 0; i < size; i++) { + if (*(skb->data + i) != packet[i]) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Error in compare! %2x - %02x %02x\n", + dev->name, i, *(skb->data + i), + packet[i]); + rc = 1; + break; + } + } + x++; + } + if (!rc) { + *data1 = 0; } - x++; - } - if (!rc) { - *data1 = 0; - } -clean_up: - pcnet32_purge_tx_ring(dev); - x = a->read_csr(ioaddr, 15) & 0xFFFF; - a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */ + clean_up: + pcnet32_purge_tx_ring(dev); + x = a->read_csr(ioaddr, 15) & 0xFFFF; + a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */ - x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ - x = x & ~0x0002; - a->write_bcr(ioaddr, 32, x); + x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ + x = x & ~0x0002; + a->write_bcr(ioaddr, 32, x); - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - if (netif_running(dev)) { - pcnet32_open(dev); - } else { - lp->a.write_bcr (ioaddr, 20, 4); /* return to 16bit mode */ - } + if (netif_running(dev)) { + pcnet32_open(dev); + } else { + lp->a.write_bcr(ioaddr, 20, 4); /* return to 16bit mode */ + } - return(rc); -} /* end pcnet32_loopback_test */ + return (rc); +} /* end pcnet32_loopback_test */ static void pcnet32_led_blink_callback(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - unsigned long flags; - int i; - - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000); - } - spin_unlock_irqrestore(&lp->lock, flags); - - mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT); + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr = dev->base_addr; + unsigned long flags; + int i; + + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000); + } + spin_unlock_irqrestore(&lp->lock, flags); + + mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT); } static int pcnet32_phys_id(struct net_device *dev, u32 data) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - unsigned long flags; - int i, regs[4]; - - if (!lp->blink_timer.function) { - init_timer(&lp->blink_timer); - lp->blink_timer.function = (void *) pcnet32_led_blink_callback; - lp->blink_timer.data = (unsigned long) dev; - } - - /* Save the current value of the bcrs */ - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - regs[i-4] = a->read_bcr(ioaddr, i); - } - spin_unlock_irqrestore(&lp->lock, flags); - - mod_timer(&lp->blink_timer, jiffies); - set_current_state(TASK_INTERRUPTIBLE); - - if ((!data) || (data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))) - data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ); - - msleep_interruptible(data * 1000); - del_timer_sync(&lp->blink_timer); - - /* Restore the original value of the bcrs */ - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - a->write_bcr(ioaddr, i, regs[i-4]); - } - spin_unlock_irqrestore(&lp->lock, flags); - - return 0; + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr 
= dev->base_addr; + unsigned long flags; + int i, regs[4]; + + if (!lp->blink_timer.function) { + init_timer(&lp->blink_timer); + lp->blink_timer.function = (void *)pcnet32_led_blink_callback; + lp->blink_timer.data = (unsigned long)dev; + } + + /* Save the current value of the bcrs */ + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + regs[i - 4] = a->read_bcr(ioaddr, i); + } + spin_unlock_irqrestore(&lp->lock, flags); + + mod_timer(&lp->blink_timer, jiffies); + set_current_state(TASK_INTERRUPTIBLE); + + if ((!data) || (data > (u32) (MAX_SCHEDULE_TIMEOUT / HZ))) + data = (u32) (MAX_SCHEDULE_TIMEOUT / HZ); + + msleep_interruptible(data * 1000); + del_timer_sync(&lp->blink_timer); + + /* Restore the original value of the bcrs */ + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + a->write_bcr(ioaddr, i, regs[i - 4]); + } + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; } +#define PCNET32_REGS_PER_PHY 32 +#define PCNET32_MAX_PHYS 32 static int pcnet32_get_regs_len(struct net_device *dev) { - return(PCNET32_NUM_REGS * sizeof(u16)); + struct pcnet32_private *lp = dev->priv; + int j = lp->phycount * PCNET32_REGS_PER_PHY; + + return ((PCNET32_NUM_REGS + j) * sizeof(u16)); } static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, - void *ptr) + void *ptr) { - int i, csr0; - u16 *buff = ptr; - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - int ticks; - unsigned long flags; - - spin_lock_irqsave(&lp->lock, flags); - - csr0 = a->read_csr(ioaddr, 0); - if (!(csr0 & 0x0004)) { /* If not stopped */ - /* set SUSPEND (SPND) - CSR5 bit 0 */ - a->write_csr(ioaddr, 5, 0x0001); - - /* poll waiting for bit to be set */ - ticks = 0; - while (!(a->read_csr(ioaddr, 5) & 0x0001)) { - spin_unlock_irqrestore(&lp->lock, flags); - mdelay(1); - spin_lock_irqsave(&lp->lock, flags); - ticks++; - if (ticks > 200) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Error getting into suspend!\n", - dev->name); - break; - } - } - } + int i, csr0; + u16 *buff = ptr; + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr = dev->base_addr; + int ticks; + unsigned long flags; - /* read address PROM */ - for (i=0; i<16; i += 2) - *buff++ = inw(ioaddr + i); + spin_lock_irqsave(&lp->lock, flags); - /* read control and status registers */ - for (i=0; i<90; i++) { - *buff++ = a->read_csr(ioaddr, i); - } + csr0 = a->read_csr(ioaddr, 0); + if (!(csr0 & 0x0004)) { /* If not stopped */ + /* set SUSPEND (SPND) - CSR5 bit 0 */ + a->write_csr(ioaddr, 5, 0x0001); + + /* poll waiting for bit to be set */ + ticks = 0; + while (!(a->read_csr(ioaddr, 5) & 0x0001)) { + spin_unlock_irqrestore(&lp->lock, flags); + mdelay(1); + spin_lock_irqsave(&lp->lock, flags); + ticks++; + if (ticks > 200) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Error getting into suspend!\n", + dev->name); + break; + } + } + } - *buff++ = a->read_csr(ioaddr, 112); - *buff++ = a->read_csr(ioaddr, 114); + /* read address PROM */ + for (i = 0; i < 16; i += 2) + *buff++ = inw(ioaddr + i); - /* read bus configuration registers */ - for (i=0; i<30; i++) { - *buff++ = a->read_bcr(ioaddr, i); - } - *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */ - for (i=31; i<36; i++) { - *buff++ = a->read_bcr(ioaddr, i); - } + /* read control and status registers */ + for (i = 0; i < 90; i++) { + *buff++ = a->read_csr(ioaddr, i); + } + + *buff++ = a->read_csr(ioaddr, 112); + *buff++ = a->read_csr(ioaddr, 114); 
- /* read mii phy registers */ - if (lp->mii) { - for (i=0; i<32; i++) { - lp->a.write_bcr(ioaddr, 33, ((lp->mii_if.phy_id) << 5) | i); - *buff++ = lp->a.read_bcr(ioaddr, 34); + /* read bus configuration registers */ + for (i = 0; i < 30; i++) { + *buff++ = a->read_bcr(ioaddr, i); + } + *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */ + for (i = 31; i < 36; i++) { + *buff++ = a->read_bcr(ioaddr, i); } - } - if (!(csr0 & 0x0004)) { /* If not stopped */ - /* clear SUSPEND (SPND) - CSR5 bit 0 */ - a->write_csr(ioaddr, 5, 0x0000); - } + /* read mii phy registers */ + if (lp->mii) { + int j; + for (j = 0; j < PCNET32_MAX_PHYS; j++) { + if (lp->phymask & (1 << j)) { + for (i = 0; i < PCNET32_REGS_PER_PHY; i++) { + lp->a.write_bcr(ioaddr, 33, + (j << 5) | i); + *buff++ = lp->a.read_bcr(ioaddr, 34); + } + } + } + } - i = buff - (u16 *)ptr; - for (; i < PCNET32_NUM_REGS; i++) - *buff++ = 0; + if (!(csr0 & 0x0004)) { /* If not stopped */ + /* clear SUSPEND (SPND) - CSR5 bit 0 */ + a->write_csr(ioaddr, 5, 0x0000); + } - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); } static struct ethtool_ops pcnet32_ethtool_ops = { - .get_settings = pcnet32_get_settings, - .set_settings = pcnet32_set_settings, - .get_drvinfo = pcnet32_get_drvinfo, - .get_msglevel = pcnet32_get_msglevel, - .set_msglevel = pcnet32_set_msglevel, - .nway_reset = pcnet32_nway_reset, - .get_link = pcnet32_get_link, - .get_ringparam = pcnet32_get_ringparam, - .set_ringparam = pcnet32_set_ringparam, - .get_tx_csum = ethtool_op_get_tx_csum, - .get_sg = ethtool_op_get_sg, - .get_tso = ethtool_op_get_tso, - .get_strings = pcnet32_get_strings, - .self_test_count = pcnet32_self_test_count, - .self_test = pcnet32_ethtool_test, - .phys_id = pcnet32_phys_id, - .get_regs_len = pcnet32_get_regs_len, - .get_regs = pcnet32_get_regs, - .get_perm_addr = ethtool_op_get_perm_addr, + .get_settings = pcnet32_get_settings, + .set_settings = pcnet32_set_settings, + .get_drvinfo = pcnet32_get_drvinfo, + .get_msglevel = pcnet32_get_msglevel, + .set_msglevel = pcnet32_set_msglevel, + .nway_reset = pcnet32_nway_reset, + .get_link = pcnet32_get_link, + .get_ringparam = pcnet32_get_ringparam, + .set_ringparam = pcnet32_set_ringparam, + .get_tx_csum = ethtool_op_get_tx_csum, + .get_sg = ethtool_op_get_sg, + .get_tso = ethtool_op_get_tso, + .get_strings = pcnet32_get_strings, + .self_test_count = pcnet32_self_test_count, + .self_test = pcnet32_ethtool_test, + .phys_id = pcnet32_phys_id, + .get_regs_len = pcnet32_get_regs_len, + .get_regs = pcnet32_get_regs, + .get_perm_addr = ethtool_op_get_perm_addr, }; /* only probes for non-PCI devices, the rest are handled by * pci_register_driver via pcnet32_probe_pci */ -static void __devinit -pcnet32_probe_vlbus(void) +static void __devinit pcnet32_probe_vlbus(void) { - unsigned int *port, ioaddr; - - /* search for PCnet32 VLB cards at known addresses */ - for (port = pcnet32_portlist; (ioaddr = *port); port++) { - if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) { - /* check if there is really a pcnet chip on that ioaddr */ - if ((inb(ioaddr + 14) == 0x57) && (inb(ioaddr + 15) == 0x57)) { - pcnet32_probe1(ioaddr, 0, NULL); - } else { - release_region(ioaddr, PCNET32_TOTAL_SIZE); - } - } - } + unsigned int *port, ioaddr; + + /* search for PCnet32 VLB cards at known addresses */ + for (port = pcnet32_portlist; (ioaddr = *port); port++) { + if (request_region + (ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) { + /* check if there is really a pcnet chip on 
that ioaddr */ + if ((inb(ioaddr + 14) == 0x57) + && (inb(ioaddr + 15) == 0x57)) { + pcnet32_probe1(ioaddr, 0, NULL); + } else { + release_region(ioaddr, PCNET32_TOTAL_SIZE); + } + } + } } - static int __devinit pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) { - unsigned long ioaddr; - int err; - - err = pci_enable_device(pdev); - if (err < 0) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "failed to enable device -- err=%d\n", err); - return err; - } - pci_set_master(pdev); + unsigned long ioaddr; + int err; + + err = pci_enable_device(pdev); + if (err < 0) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "failed to enable device -- err=%d\n", err); + return err; + } + pci_set_master(pdev); + + ioaddr = pci_resource_start(pdev, 0); + if (!ioaddr) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "card has no PCI IO resources, aborting\n"); + return -ENODEV; + } - ioaddr = pci_resource_start (pdev, 0); - if (!ioaddr) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk (KERN_ERR PFX "card has no PCI IO resources, aborting\n"); - return -ENODEV; - } + if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "architecture does not support 32bit PCI busmaster DMA\n"); + return -ENODEV; + } + if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") == + NULL) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "io address range already allocated\n"); + return -EBUSY; + } - if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "architecture does not support 32bit PCI busmaster DMA\n"); - return -ENODEV; - } - if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") == NULL) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "io address range already allocated\n"); - return -EBUSY; - } - - err = pcnet32_probe1(ioaddr, 1, pdev); - if (err < 0) { - pci_disable_device(pdev); - } - return err; + err = pcnet32_probe1(ioaddr, 1, pdev); + if (err < 0) { + pci_disable_device(pdev); + } + return err; } - /* pcnet32_probe1 * Called from both pcnet32_probe_vlbus and pcnet_probe_pci. * pdev will be NULL when called from pcnet32_probe_vlbus. 
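The probe code in the next hunk begins by choosing between the two register-access methods; a compact sketch of that detection, using the wio/dwio helpers defined earlier in this file: after a reset, CSR0 must read back as 4 (STOP set) and the RAP register must echo a written test value (the check helpers use 88) before the 16-bit word-I/O interface is trusted, otherwise the 32-bit dword-I/O interface is tried. The wrapper function itself is illustrative, not part of the driver:

/* Returns 1 for word I/O (pcnet32_wio), 0 for dword I/O (pcnet32_dwio),
 * -1 if no PCnet chip answers at this I/O address. */
static int pcnet32_detect_access(unsigned long ioaddr)
{
	pcnet32_wio_reset(ioaddr);
	if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr))
		return 1;
	pcnet32_dwio_reset(ioaddr);
	if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr))
		return 0;
	return -1;
}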
@@ -1107,630 +1028,764 @@ pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) static int __devinit pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) { - struct pcnet32_private *lp; - dma_addr_t lp_dma_addr; - int i, media; - int fdx, mii, fset, dxsuflo; - int chip_version; - char *chipname; - struct net_device *dev; - struct pcnet32_access *a = NULL; - u8 promaddr[6]; - int ret = -ENODEV; - - /* reset the chip */ - pcnet32_wio_reset(ioaddr); - - /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ - if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { - a = &pcnet32_wio; - } else { - pcnet32_dwio_reset(ioaddr); - if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { - a = &pcnet32_dwio; - } else - goto err_release_region; - } - - chip_version = a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr,89) << 16); - if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW)) - printk(KERN_INFO " PCnet chip version is %#x.\n", chip_version); - if ((chip_version & 0xfff) != 0x003) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "Unsupported chip version.\n"); - goto err_release_region; - } - - /* initialize variables */ - fdx = mii = fset = dxsuflo = 0; - chip_version = (chip_version >> 12) & 0xffff; - - switch (chip_version) { - case 0x2420: - chipname = "PCnet/PCI 79C970"; /* PCI */ - break; - case 0x2430: - if (shared) - chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */ - else - chipname = "PCnet/32 79C965"; /* 486/VL bus */ - break; - case 0x2621: - chipname = "PCnet/PCI II 79C970A"; /* PCI */ - fdx = 1; - break; - case 0x2623: - chipname = "PCnet/FAST 79C971"; /* PCI */ - fdx = 1; mii = 1; fset = 1; - break; - case 0x2624: - chipname = "PCnet/FAST+ 79C972"; /* PCI */ - fdx = 1; mii = 1; fset = 1; - break; - case 0x2625: - chipname = "PCnet/FAST III 79C973"; /* PCI */ - fdx = 1; mii = 1; - break; - case 0x2626: - chipname = "PCnet/Home 79C978"; /* PCI */ - fdx = 1; + struct pcnet32_private *lp; + dma_addr_t lp_dma_addr; + int i, media; + int fdx, mii, fset, dxsuflo; + int chip_version; + char *chipname; + struct net_device *dev; + struct pcnet32_access *a = NULL; + u8 promaddr[6]; + int ret = -ENODEV; + + /* reset the chip */ + pcnet32_wio_reset(ioaddr); + + /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ + if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { + a = &pcnet32_wio; + } else { + pcnet32_dwio_reset(ioaddr); + if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 + && pcnet32_dwio_check(ioaddr)) { + a = &pcnet32_dwio; + } else + goto err_release_region; + } + + chip_version = + a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr, 89) << 16); + if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW)) + printk(KERN_INFO " PCnet chip version is %#x.\n", + chip_version); + if ((chip_version & 0xfff) != 0x003) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX "Unsupported chip version.\n"); + goto err_release_region; + } + + /* initialize variables */ + fdx = mii = fset = dxsuflo = 0; + chip_version = (chip_version >> 12) & 0xffff; + + switch (chip_version) { + case 0x2420: + chipname = "PCnet/PCI 79C970"; /* PCI */ + break; + case 0x2430: + if (shared) + chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */ + else + chipname = "PCnet/32 79C965"; /* 486/VL bus */ + break; + case 0x2621: + chipname = "PCnet/PCI II 79C970A"; /* PCI */ + fdx = 1; + break; + case 0x2623: + 
chipname = "PCnet/FAST 79C971"; /* PCI */ + fdx = 1; + mii = 1; + fset = 1; + break; + case 0x2624: + chipname = "PCnet/FAST+ 79C972"; /* PCI */ + fdx = 1; + mii = 1; + fset = 1; + break; + case 0x2625: + chipname = "PCnet/FAST III 79C973"; /* PCI */ + fdx = 1; + mii = 1; + break; + case 0x2626: + chipname = "PCnet/Home 79C978"; /* PCI */ + fdx = 1; + /* + * This is based on specs published at www.amd.com. This section + * assumes that a card with a 79C978 wants to go into standard + * ethernet mode. The 79C978 can also go into 1Mb HomePNA mode, + * and the module option homepna=1 can select this instead. + */ + media = a->read_bcr(ioaddr, 49); + media &= ~3; /* default to 10Mb ethernet */ + if (cards_found < MAX_UNITS && homepna[cards_found]) + media |= 1; /* switch to home wiring mode */ + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_DEBUG PFX "media set to %sMbit mode.\n", + (media & 1) ? "1" : "10"); + a->write_bcr(ioaddr, 49, media); + break; + case 0x2627: + chipname = "PCnet/FAST III 79C975"; /* PCI */ + fdx = 1; + mii = 1; + break; + case 0x2628: + chipname = "PCnet/PRO 79C976"; + fdx = 1; + mii = 1; + break; + default: + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX + "PCnet version %#x, no PCnet32 chip.\n", + chip_version); + goto err_release_region; + } + /* - * This is based on specs published at www.amd.com. This section - * assumes that a card with a 79C978 wants to go into standard - * ethernet mode. The 79C978 can also go into 1Mb HomePNA mode, - * and the module option homepna=1 can select this instead. + * On selected chips turn on the BCR18:NOUFLO bit. This stops transmit + * starting until the packet is loaded. Strike one for reliability, lose + * one for latency - although on PCI this isnt a big loss. Older chips + * have FIFO's smaller than a packet, so you can't do this. + * Turn on BCR18:BurstRdEn and BCR18:BurstWrEn. */ - media = a->read_bcr(ioaddr, 49); - media &= ~3; /* default to 10Mb ethernet */ - if (cards_found < MAX_UNITS && homepna[cards_found]) - media |= 1; /* switch to home wiring mode */ - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_DEBUG PFX "media set to %sMbit mode.\n", - (media & 1) ? "1" : "10"); - a->write_bcr(ioaddr, 49, media); - break; - case 0x2627: - chipname = "PCnet/FAST III 79C975"; /* PCI */ - fdx = 1; mii = 1; - break; - case 0x2628: - chipname = "PCnet/PRO 79C976"; - fdx = 1; mii = 1; - break; - default: - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "PCnet version %#x, no PCnet32 chip.\n", - chip_version); - goto err_release_region; - } - - /* - * On selected chips turn on the BCR18:NOUFLO bit. This stops transmit - * starting until the packet is loaded. Strike one for reliability, lose - * one for latency - although on PCI this isnt a big loss. Older chips - * have FIFO's smaller than a packet, so you can't do this. - * Turn on BCR18:BurstRdEn and BCR18:BurstWrEn. 
- */ - - if (fset) { - a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0860)); - a->write_csr(ioaddr, 80, (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00); - dxsuflo = 1; - } - - dev = alloc_etherdev(0); - if (!dev) { + + if (fset) { + a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0860)); + a->write_csr(ioaddr, 80, + (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00); + dxsuflo = 1; + } + + dev = alloc_etherdev(0); + if (!dev) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "Memory allocation failed.\n"); + ret = -ENOMEM; + goto err_release_region; + } + SET_NETDEV_DEV(dev, &pdev->dev); + if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "Memory allocation failed.\n"); - ret = -ENOMEM; - goto err_release_region; - } - SET_NETDEV_DEV(dev, &pdev->dev); - - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); - - /* In most chips, after a chip reset, the ethernet address is read from the - * station address PROM at the base address and programmed into the - * "Physical Address Registers" CSR12-14. - * As a precautionary measure, we read the PROM values and complain if - * they disagree with the CSRs. Either way, we use the CSR values, and - * double check that they are valid. - */ - for (i = 0; i < 3; i++) { - unsigned int val; - val = a->read_csr(ioaddr, i+12) & 0x0ffff; - /* There may be endianness issues here. */ - dev->dev_addr[2*i] = val & 0x0ff; - dev->dev_addr[2*i+1] = (val >> 8) & 0x0ff; - } - - /* read PROM address and compare with CSR address */ - for (i = 0; i < 6; i++) - promaddr[i] = inb(ioaddr + i); - - if (memcmp(promaddr, dev->dev_addr, 6) - || !is_valid_ether_addr(dev->dev_addr)) { - if (is_valid_ether_addr(promaddr)) { - if (pcnet32_debug & NETIF_MSG_PROBE) { - printk(" warning: CSR address invalid,\n"); - printk(KERN_INFO " using instead PROM address of"); - } - memcpy(dev->dev_addr, promaddr, 6); - } - } - memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); - - /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */ - if (!is_valid_ether_addr(dev->perm_addr)) - memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); - - if (pcnet32_debug & NETIF_MSG_PROBE) { + printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); + + /* In most chips, after a chip reset, the ethernet address is read from the + * station address PROM at the base address and programmed into the + * "Physical Address Registers" CSR12-14. + * As a precautionary measure, we read the PROM values and complain if + * they disagree with the CSRs. Either way, we use the CSR values, and + * double check that they are valid. + */ + for (i = 0; i < 3; i++) { + unsigned int val; + val = a->read_csr(ioaddr, i + 12) & 0x0ffff; + /* There may be endianness issues here. 
*/ + dev->dev_addr[2 * i] = val & 0x0ff; + dev->dev_addr[2 * i + 1] = (val >> 8) & 0x0ff; + } + + /* read PROM address and compare with CSR address */ for (i = 0; i < 6; i++) - printk(" %2.2x", dev->dev_addr[i]); - - /* Version 0x2623 and 0x2624 */ - if (((chip_version + 1) & 0xfffe) == 0x2624) { - i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ - printk("\n" KERN_INFO " tx_start_pt(0x%04x):",i); - switch(i>>10) { - case 0: printk(" 20 bytes,"); break; - case 1: printk(" 64 bytes,"); break; - case 2: printk(" 128 bytes,"); break; - case 3: printk("~220 bytes,"); break; - } - i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ - printk(" BCR18(%x):",i&0xffff); - if (i & (1<<5)) printk("BurstWrEn "); - if (i & (1<<6)) printk("BurstRdEn "); - if (i & (1<<7)) printk("DWordIO "); - if (i & (1<<11)) printk("NoUFlow "); - i = a->read_bcr(ioaddr, 25); - printk("\n" KERN_INFO " SRAMSIZE=0x%04x,",i<<8); - i = a->read_bcr(ioaddr, 26); - printk(" SRAM_BND=0x%04x,",i<<8); - i = a->read_bcr(ioaddr, 27); - if (i & (1<<14)) printk("LowLatRx"); - } - } - - dev->base_addr = ioaddr; - /* pci_alloc_consistent returns page-aligned memory, so we do not have to check the alignment */ - if ((lp = pci_alloc_consistent(pdev, sizeof(*lp), &lp_dma_addr)) == NULL) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "Consistent memory allocation failed.\n"); - ret = -ENOMEM; - goto err_free_netdev; - } - - memset(lp, 0, sizeof(*lp)); - lp->dma_addr = lp_dma_addr; - lp->pci_dev = pdev; - - spin_lock_init(&lp->lock); - - SET_MODULE_OWNER(dev); - SET_NETDEV_DEV(dev, &pdev->dev); - dev->priv = lp; - lp->name = chipname; - lp->shared_irq = shared; - lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ - lp->rx_ring_size = RX_RING_SIZE; /* default rx ring size */ - lp->tx_mod_mask = lp->tx_ring_size - 1; - lp->rx_mod_mask = lp->rx_ring_size - 1; - lp->tx_len_bits = (PCNET32_LOG_TX_BUFFERS << 12); - lp->rx_len_bits = (PCNET32_LOG_RX_BUFFERS << 4); - lp->mii_if.full_duplex = fdx; - lp->mii_if.phy_id_mask = 0x1f; - lp->mii_if.reg_num_mask = 0x1f; - lp->dxsuflo = dxsuflo; - lp->mii = mii; - lp->msg_enable = pcnet32_debug; - if ((cards_found >= MAX_UNITS) || (options[cards_found] > sizeof(options_mapping))) - lp->options = PCNET32_PORT_ASEL; - else - lp->options = options_mapping[options[cards_found]]; - lp->mii_if.dev = dev; - lp->mii_if.mdio_read = mdio_read; - lp->mii_if.mdio_write = mdio_write; - - if (fdx && !(lp->options & PCNET32_PORT_ASEL) && - ((cards_found>=MAX_UNITS) || full_duplex[cards_found])) - lp->options |= PCNET32_PORT_FD; - - if (!a) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "No access methods\n"); - ret = -ENODEV; - goto err_free_consistent; - } - lp->a = *a; - - /* prior to register_netdev, dev->name is not yet correct */ - if (pcnet32_alloc_ring(dev, pci_name(lp->pci_dev))) { - ret = -ENOMEM; - goto err_free_ring; - } - /* detect special T1/E1 WAN card by checking for MAC address */ - if (dev->dev_addr[0] == 0x00 && dev->dev_addr[1] == 0xe0 + promaddr[i] = inb(ioaddr + i); + + if (memcmp(promaddr, dev->dev_addr, 6) + || !is_valid_ether_addr(dev->dev_addr)) { + if (is_valid_ether_addr(promaddr)) { + if (pcnet32_debug & NETIF_MSG_PROBE) { + printk(" warning: CSR address invalid,\n"); + printk(KERN_INFO + " using instead PROM address of"); + } + memcpy(dev->dev_addr, promaddr, 6); + } + } + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + + /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */ + if 
(!is_valid_ether_addr(dev->perm_addr)) + memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); + + if (pcnet32_debug & NETIF_MSG_PROBE) { + for (i = 0; i < 6; i++) + printk(" %2.2x", dev->dev_addr[i]); + + /* Version 0x2623 and 0x2624 */ + if (((chip_version + 1) & 0xfffe) == 0x2624) { + i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ + printk("\n" KERN_INFO " tx_start_pt(0x%04x):", i); + switch (i >> 10) { + case 0: + printk(" 20 bytes,"); + break; + case 1: + printk(" 64 bytes,"); + break; + case 2: + printk(" 128 bytes,"); + break; + case 3: + printk("~220 bytes,"); + break; + } + i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ + printk(" BCR18(%x):", i & 0xffff); + if (i & (1 << 5)) + printk("BurstWrEn "); + if (i & (1 << 6)) + printk("BurstRdEn "); + if (i & (1 << 7)) + printk("DWordIO "); + if (i & (1 << 11)) + printk("NoUFlow "); + i = a->read_bcr(ioaddr, 25); + printk("\n" KERN_INFO " SRAMSIZE=0x%04x,", i << 8); + i = a->read_bcr(ioaddr, 26); + printk(" SRAM_BND=0x%04x,", i << 8); + i = a->read_bcr(ioaddr, 27); + if (i & (1 << 14)) + printk("LowLatRx"); + } + } + + dev->base_addr = ioaddr; + /* pci_alloc_consistent returns page-aligned memory, so we do not have to check the alignment */ + if ((lp = + pci_alloc_consistent(pdev, sizeof(*lp), &lp_dma_addr)) == NULL) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "Consistent memory allocation failed.\n"); + ret = -ENOMEM; + goto err_free_netdev; + } + + memset(lp, 0, sizeof(*lp)); + lp->dma_addr = lp_dma_addr; + lp->pci_dev = pdev; + + spin_lock_init(&lp->lock); + + SET_MODULE_OWNER(dev); + SET_NETDEV_DEV(dev, &pdev->dev); + dev->priv = lp; + lp->name = chipname; + lp->shared_irq = shared; + lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ + lp->rx_ring_size = RX_RING_SIZE; /* default rx ring size */ + lp->tx_mod_mask = lp->tx_ring_size - 1; + lp->rx_mod_mask = lp->rx_ring_size - 1; + lp->tx_len_bits = (PCNET32_LOG_TX_BUFFERS << 12); + lp->rx_len_bits = (PCNET32_LOG_RX_BUFFERS << 4); + lp->mii_if.full_duplex = fdx; + lp->mii_if.phy_id_mask = 0x1f; + lp->mii_if.reg_num_mask = 0x1f; + lp->dxsuflo = dxsuflo; + lp->mii = mii; + lp->msg_enable = pcnet32_debug; + if ((cards_found >= MAX_UNITS) + || (options[cards_found] > sizeof(options_mapping))) + lp->options = PCNET32_PORT_ASEL; + else + lp->options = options_mapping[options[cards_found]]; + lp->mii_if.dev = dev; + lp->mii_if.mdio_read = mdio_read; + lp->mii_if.mdio_write = mdio_write; + + if (fdx && !(lp->options & PCNET32_PORT_ASEL) && + ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) + lp->options |= PCNET32_PORT_FD; + + if (!a) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "No access methods\n"); + ret = -ENODEV; + goto err_free_consistent; + } + lp->a = *a; + + /* prior to register_netdev, dev->name is not yet correct */ + if (pcnet32_alloc_ring(dev, pci_name(lp->pci_dev))) { + ret = -ENOMEM; + goto err_free_ring; + } + /* detect special T1/E1 WAN card by checking for MAC address */ + if (dev->dev_addr[0] == 0x00 && dev->dev_addr[1] == 0xe0 && dev->dev_addr[2] == 0x75) - lp->options = PCNET32_PORT_FD | PCNET32_PORT_GPSI; - - lp->init_block.mode = le16_to_cpu(0x0003); /* Disable Rx and Tx. 
*/ - lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); - for (i = 0; i < 6; i++) - lp->init_block.phys_addr[i] = dev->dev_addr[i]; - lp->init_block.filter[0] = 0x00000000; - lp->init_block.filter[1] = 0x00000000; - lp->init_block.rx_ring = (u32)le32_to_cpu(lp->rx_ring_dma_addr); - lp->init_block.tx_ring = (u32)le32_to_cpu(lp->tx_ring_dma_addr); - - /* switch pcnet32 to 32bit mode */ - a->write_bcr(ioaddr, 20, 2); - - a->write_csr(ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private, - init_block)) & 0xffff); - a->write_csr(ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private, - init_block)) >> 16); - - if (pdev) { /* use the IRQ provided by PCI */ - dev->irq = pdev->irq; - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(" assigned IRQ %d.\n", dev->irq); - } else { - unsigned long irq_mask = probe_irq_on(); + lp->options = PCNET32_PORT_FD | PCNET32_PORT_GPSI; - /* - * To auto-IRQ we enable the initialization-done and DMA error - * interrupts. For ISA boards we get a DMA error, but VLB and PCI - * boards will work. - */ - /* Trigger an initialization just for the interrupt. */ - a->write_csr (ioaddr, 0, 0x41); - mdelay (1); + lp->init_block.mode = le16_to_cpu(0x0003); /* Disable Rx and Tx. */ + lp->init_block.tlen_rlen = + le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); + for (i = 0; i < 6; i++) + lp->init_block.phys_addr[i] = dev->dev_addr[i]; + lp->init_block.filter[0] = 0x00000000; + lp->init_block.filter[1] = 0x00000000; + lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr); + lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr); + + /* switch pcnet32 to 32bit mode */ + a->write_bcr(ioaddr, 20, 2); + + a->write_csr(ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private, + init_block)) & 0xffff); + a->write_csr(ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private, + init_block)) >> 16); + + if (pdev) { /* use the IRQ provided by PCI */ + dev->irq = pdev->irq; + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(" assigned IRQ %d.\n", dev->irq); + } else { + unsigned long irq_mask = probe_irq_on(); + + /* + * To auto-IRQ we enable the initialization-done and DMA error + * interrupts. For ISA boards we get a DMA error, but VLB and PCI + * boards will work. + */ + /* Trigger an initialization just for the interrupt. 
*/ + a->write_csr(ioaddr, 0, 0x41); + mdelay(1); + + dev->irq = probe_irq_off(irq_mask); + if (!dev->irq) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(", failed to detect IRQ line.\n"); + ret = -ENODEV; + goto err_free_ring; + } + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(", probed IRQ %d.\n", dev->irq); + } - dev->irq = probe_irq_off (irq_mask); - if (!dev->irq) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(", failed to detect IRQ line.\n"); - ret = -ENODEV; - goto err_free_ring; + /* Set the mii phy_id so that we can query the link state */ + if (lp->mii) { + /* lp->phycount and lp->phymask are set to 0 by memset above */ + + lp->mii_if.phy_id = ((lp->a.read_bcr(ioaddr, 33)) >> 5) & 0x1f; + /* scan for PHYs */ + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + unsigned short id1, id2; + + id1 = mdio_read(dev, i, MII_PHYSID1); + if (id1 == 0xffff) + continue; + id2 = mdio_read(dev, i, MII_PHYSID2); + if (id2 == 0xffff) + continue; + if (i == 31 && ((chip_version + 1) & 0xfffe) == 0x2624) + continue; /* 79C971 & 79C972 have phantom phy at id 31 */ + lp->phycount++; + lp->phymask |= (1 << i); + lp->mii_if.phy_id = i; + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX + "Found PHY %04x:%04x at address %d.\n", + id1, id2, i); + } + lp->a.write_bcr(ioaddr, 33, (lp->mii_if.phy_id) << 5); + if (lp->phycount > 1) { + lp->options |= PCNET32_PORT_MII; + } } - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(", probed IRQ %d.\n", dev->irq); - } - - /* Set the mii phy_id so that we can query the link state */ - if (lp->mii) - lp->mii_if.phy_id = ((lp->a.read_bcr (ioaddr, 33)) >> 5) & 0x1f; - - init_timer (&lp->watchdog_timer); - lp->watchdog_timer.data = (unsigned long) dev; - lp->watchdog_timer.function = (void *) &pcnet32_watchdog; - - /* The PCNET32-specific entries in the device structure. */ - dev->open = &pcnet32_open; - dev->hard_start_xmit = &pcnet32_start_xmit; - dev->stop = &pcnet32_close; - dev->get_stats = &pcnet32_get_stats; - dev->set_multicast_list = &pcnet32_set_multicast_list; - dev->do_ioctl = &pcnet32_ioctl; - dev->ethtool_ops = &pcnet32_ethtool_ops; - dev->tx_timeout = pcnet32_tx_timeout; - dev->watchdog_timeo = (5*HZ); + + init_timer(&lp->watchdog_timer); + lp->watchdog_timer.data = (unsigned long)dev; + lp->watchdog_timer.function = (void *)&pcnet32_watchdog; + + /* The PCNET32-specific entries in the device structure. */ + dev->open = &pcnet32_open; + dev->hard_start_xmit = &pcnet32_start_xmit; + dev->stop = &pcnet32_close; + dev->get_stats = &pcnet32_get_stats; + dev->set_multicast_list = &pcnet32_set_multicast_list; + dev->do_ioctl = &pcnet32_ioctl; + dev->ethtool_ops = &pcnet32_ethtool_ops; + dev->tx_timeout = pcnet32_tx_timeout; + dev->watchdog_timeo = (5 * HZ); #ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = pcnet32_poll_controller; + dev->poll_controller = pcnet32_poll_controller; #endif - /* Fill in the generic fields of the device structure. 
*/ - if (register_netdev(dev)) - goto err_free_ring; - - if (pdev) { - pci_set_drvdata(pdev, dev); - } else { - lp->next = pcnet32_dev; - pcnet32_dev = dev; - } - - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO "%s: registered as %s\n", dev->name, lp->name); - cards_found++; - - /* enable LED writes */ - a->write_bcr(ioaddr, 2, a->read_bcr(ioaddr, 2) | 0x1000); - - return 0; - -err_free_ring: - pcnet32_free_ring(dev); -err_free_consistent: - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); -err_free_netdev: - free_netdev(dev); -err_release_region: - release_region(ioaddr, PCNET32_TOTAL_SIZE); - return ret; -} + /* Fill in the generic fields of the device structure. */ + if (register_netdev(dev)) + goto err_free_ring; + + if (pdev) { + pci_set_drvdata(pdev, dev); + } else { + lp->next = pcnet32_dev; + pcnet32_dev = dev; + } + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO "%s: registered as %s\n", dev->name, lp->name); + cards_found++; + + /* enable LED writes */ + a->write_bcr(ioaddr, 2, a->read_bcr(ioaddr, 2) | 0x1000); + + return 0; + + err_free_ring: + pcnet32_free_ring(dev); + err_free_consistent: + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + err_free_netdev: + free_netdev(dev); + err_release_region: + release_region(ioaddr, PCNET32_TOTAL_SIZE); + return ret; +} /* if any allocation fails, caller must also call pcnet32_free_ring */ static int pcnet32_alloc_ring(struct net_device *dev, char *name) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - lp->tx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_tx_head) * lp->tx_ring_size, - &lp->tx_ring_dma_addr); - if (lp->tx_ring == NULL) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Consistent memory allocation failed.\n", - name); - return -ENOMEM; - } - - lp->rx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_rx_head) * lp->rx_ring_size, - &lp->rx_ring_dma_addr); - if (lp->rx_ring == NULL) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Consistent memory allocation failed.\n", - name); - return -ENOMEM; - } - - lp->tx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->tx_ring_size, - GFP_ATOMIC); - if (!lp->tx_dma_addr) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->tx_dma_addr, 0, sizeof(dma_addr_t) * lp->tx_ring_size); - - lp->rx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->rx_ring_size, - GFP_ATOMIC); - if (!lp->rx_dma_addr) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->rx_dma_addr, 0, sizeof(dma_addr_t) * lp->rx_ring_size); - - lp->tx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->tx_ring_size, - GFP_ATOMIC); - if (!lp->tx_skbuff) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->tx_skbuff, 0, sizeof(struct sk_buff *) * lp->tx_ring_size); - - lp->rx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->rx_ring_size, - GFP_ATOMIC); - if (!lp->rx_skbuff) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->rx_skbuff, 0, sizeof(struct sk_buff *) * lp->rx_ring_size); + lp->tx_ring = pci_alloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_tx_head) * + lp->tx_ring_size, + &lp->tx_ring_dma_addr); + if 
(lp->tx_ring == NULL) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Consistent memory allocation failed.\n", + name); + return -ENOMEM; + } - return 0; -} + lp->rx_ring = pci_alloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_rx_head) * + lp->rx_ring_size, + &lp->rx_ring_dma_addr); + if (lp->rx_ring == NULL) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Consistent memory allocation failed.\n", + name); + return -ENOMEM; + } + lp->tx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->tx_ring_size, + GFP_ATOMIC); + if (!lp->tx_dma_addr) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->tx_dma_addr, 0, sizeof(dma_addr_t) * lp->tx_ring_size); + + lp->rx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->rx_ring_size, + GFP_ATOMIC); + if (!lp->rx_dma_addr) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->rx_dma_addr, 0, sizeof(dma_addr_t) * lp->rx_ring_size); + + lp->tx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->tx_ring_size, + GFP_ATOMIC); + if (!lp->tx_skbuff) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->tx_skbuff, 0, sizeof(struct sk_buff *) * lp->tx_ring_size); + + lp->rx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->rx_ring_size, + GFP_ATOMIC); + if (!lp->rx_skbuff) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->rx_skbuff, 0, sizeof(struct sk_buff *) * lp->rx_ring_size); + + return 0; +} static void pcnet32_free_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - kfree(lp->tx_skbuff); - lp->tx_skbuff = NULL; + kfree(lp->tx_skbuff); + lp->tx_skbuff = NULL; - kfree(lp->rx_skbuff); - lp->rx_skbuff = NULL; + kfree(lp->rx_skbuff); + lp->rx_skbuff = NULL; - kfree(lp->tx_dma_addr); - lp->tx_dma_addr = NULL; + kfree(lp->tx_dma_addr); + lp->tx_dma_addr = NULL; - kfree(lp->rx_dma_addr); - lp->rx_dma_addr = NULL; + kfree(lp->rx_dma_addr); + lp->rx_dma_addr = NULL; - if (lp->tx_ring) { - pci_free_consistent(lp->pci_dev, sizeof(struct pcnet32_tx_head) * lp->tx_ring_size, - lp->tx_ring, lp->tx_ring_dma_addr); - lp->tx_ring = NULL; - } + if (lp->tx_ring) { + pci_free_consistent(lp->pci_dev, + sizeof(struct pcnet32_tx_head) * + lp->tx_ring_size, lp->tx_ring, + lp->tx_ring_dma_addr); + lp->tx_ring = NULL; + } - if (lp->rx_ring) { - pci_free_consistent(lp->pci_dev, sizeof(struct pcnet32_rx_head) * lp->rx_ring_size, - lp->rx_ring, lp->rx_ring_dma_addr); - lp->rx_ring = NULL; - } + if (lp->rx_ring) { + pci_free_consistent(lp->pci_dev, + sizeof(struct pcnet32_rx_head) * + lp->rx_ring_size, lp->rx_ring, + lp->rx_ring_dma_addr); + lp->rx_ring = NULL; + } } - -static int -pcnet32_open(struct net_device *dev) +static int pcnet32_open(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 val; - int i; - int rc; - unsigned long flags; - - if (request_irq(dev->irq, &pcnet32_interrupt, - lp->shared_irq ? 
SA_SHIRQ : 0, dev->name, (void *)dev)) { - return -EAGAIN; - } - - spin_lock_irqsave(&lp->lock, flags); - /* Check for a valid station address */ - if (!is_valid_ether_addr(dev->dev_addr)) { - rc = -EINVAL; - goto err_free_irq; - } - - /* Reset the PCNET32 */ - lp->a.reset (ioaddr); - - /* switch pcnet32 to 32bit mode */ - lp->a.write_bcr (ioaddr, 20, 2); - - if (netif_msg_ifup(lp)) - printk(KERN_DEBUG "%s: pcnet32_open() irq %d tx/rx rings %#x/%#x init %#x.\n", - dev->name, dev->irq, - (u32) (lp->tx_ring_dma_addr), - (u32) (lp->rx_ring_dma_addr), - (u32) (lp->dma_addr + offsetof(struct pcnet32_private, init_block))); - - /* set/reset autoselect bit */ - val = lp->a.read_bcr (ioaddr, 2) & ~2; - if (lp->options & PCNET32_PORT_ASEL) - val |= 2; - lp->a.write_bcr (ioaddr, 2, val); - - /* handle full duplex setting */ - if (lp->mii_if.full_duplex) { - val = lp->a.read_bcr (ioaddr, 9) & ~3; - if (lp->options & PCNET32_PORT_FD) { - val |= 1; - if (lp->options == (PCNET32_PORT_FD | PCNET32_PORT_AUI)) + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 val; + int i; + int rc; + unsigned long flags; + + if (request_irq(dev->irq, &pcnet32_interrupt, + lp->shared_irq ? SA_SHIRQ : 0, dev->name, + (void *)dev)) { + return -EAGAIN; + } + + spin_lock_irqsave(&lp->lock, flags); + /* Check for a valid station address */ + if (!is_valid_ether_addr(dev->dev_addr)) { + rc = -EINVAL; + goto err_free_irq; + } + + /* Reset the PCNET32 */ + lp->a.reset(ioaddr); + + /* switch pcnet32 to 32bit mode */ + lp->a.write_bcr(ioaddr, 20, 2); + + if (netif_msg_ifup(lp)) + printk(KERN_DEBUG + "%s: pcnet32_open() irq %d tx/rx rings %#x/%#x init %#x.\n", + dev->name, dev->irq, (u32) (lp->tx_ring_dma_addr), + (u32) (lp->rx_ring_dma_addr), + (u32) (lp->dma_addr + + offsetof(struct pcnet32_private, init_block))); + + /* set/reset autoselect bit */ + val = lp->a.read_bcr(ioaddr, 2) & ~2; + if (lp->options & PCNET32_PORT_ASEL) val |= 2; - } else if (lp->options & PCNET32_PORT_ASEL) { - /* workaround of xSeries250, turn on for 79C975 only */ - i = ((lp->a.read_csr(ioaddr, 88) | - (lp->a.read_csr(ioaddr,89) << 16)) >> 12) & 0xffff; - if (i == 0x2627) - val |= 3; - } - lp->a.write_bcr (ioaddr, 9, val); - } - - /* set/reset GPSI bit in test register */ - val = lp->a.read_csr (ioaddr, 124) & ~0x10; - if ((lp->options & PCNET32_PORT_PORTSEL) == PCNET32_PORT_GPSI) - val |= 0x10; - lp->a.write_csr (ioaddr, 124, val); - - /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ - if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && + lp->a.write_bcr(ioaddr, 2, val); + + /* handle full duplex setting */ + if (lp->mii_if.full_duplex) { + val = lp->a.read_bcr(ioaddr, 9) & ~3; + if (lp->options & PCNET32_PORT_FD) { + val |= 1; + if (lp->options == (PCNET32_PORT_FD | PCNET32_PORT_AUI)) + val |= 2; + } else if (lp->options & PCNET32_PORT_ASEL) { + /* workaround of xSeries250, turn on for 79C975 only */ + i = ((lp->a.read_csr(ioaddr, 88) | + (lp->a. 
+ read_csr(ioaddr, 89) << 16)) >> 12) & 0xffff; + if (i == 0x2627) + val |= 3; + } + lp->a.write_bcr(ioaddr, 9, val); + } + + /* set/reset GPSI bit in test register */ + val = lp->a.read_csr(ioaddr, 124) & ~0x10; + if ((lp->options & PCNET32_PORT_PORTSEL) == PCNET32_PORT_GPSI) + val |= 0x10; + lp->a.write_csr(ioaddr, 124, val); + + /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ + if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && (lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { - if (lp->options & PCNET32_PORT_ASEL) { - lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; - if (netif_msg_link(lp)) - printk(KERN_DEBUG "%s: Setting 100Mb-Full Duplex.\n", - dev->name); - } - } - { - /* - * 24 Jun 2004 according AMD, in order to change the PHY, - * DANAS (or DISPM for 79C976) must be set; then select the speed, - * duplex, and/or enable auto negotiation, and clear DANAS - */ - if (lp->mii && !(lp->options & PCNET32_PORT_ASEL)) { - lp->a.write_bcr(ioaddr, 32, - lp->a.read_bcr(ioaddr, 32) | 0x0080); - /* disable Auto Negotiation, set 10Mpbs, HD */ - val = lp->a.read_bcr(ioaddr, 32) & ~0xb8; - if (lp->options & PCNET32_PORT_FD) - val |= 0x10; - if (lp->options & PCNET32_PORT_100) - val |= 0x08; - lp->a.write_bcr (ioaddr, 32, val); + if (lp->options & PCNET32_PORT_ASEL) { + lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; + if (netif_msg_link(lp)) + printk(KERN_DEBUG + "%s: Setting 100Mb-Full Duplex.\n", + dev->name); + } + } + if (lp->phycount < 2) { + /* + * 24 Jun 2004 according AMD, in order to change the PHY, + * DANAS (or DISPM for 79C976) must be set; then select the speed, + * duplex, and/or enable auto negotiation, and clear DANAS + */ + if (lp->mii && !(lp->options & PCNET32_PORT_ASEL)) { + lp->a.write_bcr(ioaddr, 32, + lp->a.read_bcr(ioaddr, 32) | 0x0080); + /* disable Auto Negotiation, set 10Mpbs, HD */ + val = lp->a.read_bcr(ioaddr, 32) & ~0xb8; + if (lp->options & PCNET32_PORT_FD) + val |= 0x10; + if (lp->options & PCNET32_PORT_100) + val |= 0x08; + lp->a.write_bcr(ioaddr, 32, val); + } else { + if (lp->options & PCNET32_PORT_ASEL) { + lp->a.write_bcr(ioaddr, 32, + lp->a.read_bcr(ioaddr, + 32) | 0x0080); + /* enable auto negotiate, setup, disable fd */ + val = lp->a.read_bcr(ioaddr, 32) & ~0x98; + val |= 0x20; + lp->a.write_bcr(ioaddr, 32, val); + } + } } else { - if (lp->options & PCNET32_PORT_ASEL) { - lp->a.write_bcr(ioaddr, 32, - lp->a.read_bcr(ioaddr, 32) | 0x0080); - /* enable auto negotiate, setup, disable fd */ - val = lp->a.read_bcr(ioaddr, 32) & ~0x98; - val |= 0x20; - lp->a.write_bcr(ioaddr, 32, val); - } + int first_phy = -1; + u16 bmcr; + u32 bcr9; + struct ethtool_cmd ecmd; + + /* + * There is really no good other way to handle multiple PHYs + * other than turning off all automatics + */ + val = lp->a.read_bcr(ioaddr, 2); + lp->a.write_bcr(ioaddr, 2, val & ~2); + val = lp->a.read_bcr(ioaddr, 32); + lp->a.write_bcr(ioaddr, 32, val & ~(1 << 7)); /* stop MII manager */ + + if (!(lp->options & PCNET32_PORT_ASEL)) { + /* setup ecmd */ + ecmd.port = PORT_MII; + ecmd.transceiver = XCVR_INTERNAL; + ecmd.autoneg = AUTONEG_DISABLE; + ecmd.speed = + lp-> + options & PCNET32_PORT_100 ? 
SPEED_100 : SPEED_10; + bcr9 = lp->a.read_bcr(ioaddr, 9); + + if (lp->options & PCNET32_PORT_FD) { + ecmd.duplex = DUPLEX_FULL; + bcr9 |= (1 << 0); + } else { + ecmd.duplex = DUPLEX_HALF; + bcr9 |= ~(1 << 0); + } + lp->a.write_bcr(ioaddr, 9, bcr9); + } + + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + if (lp->phymask & (1 << i)) { + /* isolate all but the first PHY */ + bmcr = mdio_read(dev, i, MII_BMCR); + if (first_phy == -1) { + first_phy = i; + mdio_write(dev, i, MII_BMCR, + bmcr & ~BMCR_ISOLATE); + } else { + mdio_write(dev, i, MII_BMCR, + bmcr | BMCR_ISOLATE); + } + /* use mii_ethtool_sset to setup PHY */ + lp->mii_if.phy_id = i; + ecmd.phy_address = i; + if (lp->options & PCNET32_PORT_ASEL) { + mii_ethtool_gset(&lp->mii_if, &ecmd); + ecmd.autoneg = AUTONEG_ENABLE; + } + mii_ethtool_sset(&lp->mii_if, &ecmd); + } + } + lp->mii_if.phy_id = first_phy; + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: Using PHY number %d.\n", + dev->name, first_phy); } - } #ifdef DO_DXSUFLO - if (lp->dxsuflo) { /* Disable transmit stop on underflow */ - val = lp->a.read_csr (ioaddr, 3); - val |= 0x40; - lp->a.write_csr (ioaddr, 3, val); - } + if (lp->dxsuflo) { /* Disable transmit stop on underflow */ + val = lp->a.read_csr(ioaddr, 3); + val |= 0x40; + lp->a.write_csr(ioaddr, 3, val); + } #endif - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - pcnet32_load_multicast(dev); - - if (pcnet32_init_ring(dev)) { - rc = -ENOMEM; - goto err_free_ring; - } - - /* Re-initialize the PCNET32, and start it when done. */ - lp->a.write_csr (ioaddr, 1, (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)) & 0xffff); - lp->a.write_csr (ioaddr, 2, (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)) >> 16); - - lp->a.write_csr (ioaddr, 4, 0x0915); - lp->a.write_csr (ioaddr, 0, 0x0001); - - netif_start_queue(dev); - - /* If we have mii, print the link status and start the watchdog */ - if (lp->mii) { - mii_check_media (&lp->mii_if, netif_msg_link(lp), 1); - mod_timer (&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); - } - - i = 0; - while (i++ < 100) - if (lp->a.read_csr (ioaddr, 0) & 0x0100) - break; - /* - * We used to clear the InitDone bit, 0x0100, here but Mark Stockton - * reports that doing so triggers a bug in the '974. - */ - lp->a.write_csr (ioaddr, 0, 0x0042); - - if (netif_msg_ifup(lp)) - printk(KERN_DEBUG "%s: pcnet32 open after %d ticks, init block %#x csr0 %4.4x.\n", - dev->name, i, (u32) (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)), - lp->a.read_csr(ioaddr, 0)); - - spin_unlock_irqrestore(&lp->lock, flags); - - return 0; /* Always succeed */ - -err_free_ring: - /* free any allocated skbuffs */ - for (i = 0; i < lp->rx_ring_size; i++) { - lp->rx_ring[i].status = 0; - if (lp->rx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - dev_kfree_skb(lp->rx_skbuff[i]); - } - lp->rx_skbuff[i] = NULL; - lp->rx_dma_addr[i] = 0; - } - - pcnet32_free_ring(dev); - - /* - * Switch back to 16bit mode to avoid problems with dumb - * DOS packet driver after a warm reboot - */ - lp->a.write_bcr (ioaddr, 20, 4); - -err_free_irq: - spin_unlock_irqrestore(&lp->lock, flags); - free_irq(dev->irq, dev); - return rc; + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + pcnet32_load_multicast(dev); + + if (pcnet32_init_ring(dev)) { + rc = -ENOMEM; + goto err_free_ring; + } + + /* Re-initialize the PCNET32, and start it when done. 
*/ + lp->a.write_csr(ioaddr, 1, (lp->dma_addr + + offsetof(struct pcnet32_private, + init_block)) & 0xffff); + lp->a.write_csr(ioaddr, 2, + (lp->dma_addr + + offsetof(struct pcnet32_private, init_block)) >> 16); + + lp->a.write_csr(ioaddr, 4, 0x0915); + lp->a.write_csr(ioaddr, 0, 0x0001); + + netif_start_queue(dev); + + /* Print the link status and start the watchdog */ + pcnet32_check_media(dev, 1); + mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); + + i = 0; + while (i++ < 100) + if (lp->a.read_csr(ioaddr, 0) & 0x0100) + break; + /* + * We used to clear the InitDone bit, 0x0100, here but Mark Stockton + * reports that doing so triggers a bug in the '974. + */ + lp->a.write_csr(ioaddr, 0, 0x0042); + + if (netif_msg_ifup(lp)) + printk(KERN_DEBUG + "%s: pcnet32 open after %d ticks, init block %#x csr0 %4.4x.\n", + dev->name, i, + (u32) (lp->dma_addr + + offsetof(struct pcnet32_private, init_block)), + lp->a.read_csr(ioaddr, 0)); + + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; /* Always succeed */ + + err_free_ring: + /* free any allocated skbuffs */ + for (i = 0; i < lp->rx_ring_size; i++) { + lp->rx_ring[i].status = 0; + if (lp->rx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + dev_kfree_skb(lp->rx_skbuff[i]); + } + lp->rx_skbuff[i] = NULL; + lp->rx_dma_addr[i] = 0; + } + + pcnet32_free_ring(dev); + + /* + * Switch back to 16bit mode to avoid problems with dumb + * DOS packet driver after a warm reboot + */ + lp->a.write_bcr(ioaddr, 20, 4); + + err_free_irq: + spin_unlock_irqrestore(&lp->lock, flags); + free_irq(dev->irq, dev); + return rc; } /* @@ -1746,727 +1801,893 @@ err_free_irq: * restarting the chip, but I'm too lazy to do so right now. dplatt@3do.com */ -static void -pcnet32_purge_tx_ring(struct net_device *dev) +static void pcnet32_purge_tx_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int i; - - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - if (lp->tx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], - lp->tx_skbuff[i]->len, PCI_DMA_TODEVICE); - dev_kfree_skb_any(lp->tx_skbuff[i]); - } - lp->tx_skbuff[i] = NULL; - lp->tx_dma_addr[i] = 0; - } -} + struct pcnet32_private *lp = dev->priv; + int i; + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + if (lp->tx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], + lp->tx_skbuff[i]->len, + PCI_DMA_TODEVICE); + dev_kfree_skb_any(lp->tx_skbuff[i]); + } + lp->tx_skbuff[i] = NULL; + lp->tx_dma_addr[i] = 0; + } +} /* Initialize the PCNET32 Rx and Tx rings. 
*/ -static int -pcnet32_init_ring(struct net_device *dev) +static int pcnet32_init_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int i; - - lp->tx_full = 0; - lp->cur_rx = lp->cur_tx = 0; - lp->dirty_rx = lp->dirty_tx = 0; - - for (i = 0; i < lp->rx_ring_size; i++) { - struct sk_buff *rx_skbuff = lp->rx_skbuff[i]; - if (rx_skbuff == NULL) { - if (!(rx_skbuff = lp->rx_skbuff[i] = dev_alloc_skb (PKT_BUF_SZ))) { - /* there is not much, we can do at this point */ - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_ERR "%s: pcnet32_init_ring dev_alloc_skb failed.\n", - dev->name); - return -1; - } - skb_reserve (rx_skbuff, 2); - } - - rmb(); - if (lp->rx_dma_addr[i] == 0) - lp->rx_dma_addr[i] = pci_map_single(lp->pci_dev, rx_skbuff->data, - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - lp->rx_ring[i].base = (u32)le32_to_cpu(lp->rx_dma_addr[i]); - lp->rx_ring[i].buf_length = le16_to_cpu(2-PKT_BUF_SZ); - wmb(); /* Make sure owner changes after all others are visible */ - lp->rx_ring[i].status = le16_to_cpu(0x8000); - } - /* The Tx buffer address is filled in as needed, but we do need to clear - * the upper ownership bit. */ - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - lp->tx_ring[i].base = 0; - lp->tx_dma_addr[i] = 0; - } - - lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); - for (i = 0; i < 6; i++) - lp->init_block.phys_addr[i] = dev->dev_addr[i]; - lp->init_block.rx_ring = (u32)le32_to_cpu(lp->rx_ring_dma_addr); - lp->init_block.tx_ring = (u32)le32_to_cpu(lp->tx_ring_dma_addr); - wmb(); /* Make sure all changes are visible */ - return 0; + struct pcnet32_private *lp = dev->priv; + int i; + + lp->tx_full = 0; + lp->cur_rx = lp->cur_tx = 0; + lp->dirty_rx = lp->dirty_tx = 0; + + for (i = 0; i < lp->rx_ring_size; i++) { + struct sk_buff *rx_skbuff = lp->rx_skbuff[i]; + if (rx_skbuff == NULL) { + if (! + (rx_skbuff = lp->rx_skbuff[i] = + dev_alloc_skb(PKT_BUF_SZ))) { + /* there is not much, we can do at this point */ + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_ERR + "%s: pcnet32_init_ring dev_alloc_skb failed.\n", + dev->name); + return -1; + } + skb_reserve(rx_skbuff, 2); + } + + rmb(); + if (lp->rx_dma_addr[i] == 0) + lp->rx_dma_addr[i] = + pci_map_single(lp->pci_dev, rx_skbuff->data, + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + lp->rx_ring[i].base = (u32) le32_to_cpu(lp->rx_dma_addr[i]); + lp->rx_ring[i].buf_length = le16_to_cpu(2 - PKT_BUF_SZ); + wmb(); /* Make sure owner changes after all others are visible */ + lp->rx_ring[i].status = le16_to_cpu(0x8000); + } + /* The Tx buffer address is filled in as needed, but we do need to clear + * the upper ownership bit. */ + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + lp->tx_ring[i].base = 0; + lp->tx_dma_addr[i] = 0; + } + + lp->init_block.tlen_rlen = + le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); + for (i = 0; i < 6; i++) + lp->init_block.phys_addr[i] = dev->dev_addr[i]; + lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr); + lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr); + wmb(); /* Make sure all changes are visible */ + return 0; } /* the pcnet32 has been issued a stop or reset. Wait for the stop bit * then flush the pending transmit operations, re-initialize the ring, * and tell the chip to initialize. 
*/ -static void -pcnet32_restart(struct net_device *dev, unsigned int csr0_bits) +static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - int i; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + int i; - /* wait for stop */ - for (i=0; i<100; i++) - if (lp->a.read_csr(ioaddr, 0) & 0x0004) - break; + /* wait for stop */ + for (i = 0; i < 100; i++) + if (lp->a.read_csr(ioaddr, 0) & 0x0004) + break; - if (i >= 100 && netif_msg_drv(lp)) - printk(KERN_ERR "%s: pcnet32_restart timed out waiting for stop.\n", - dev->name); + if (i >= 100 && netif_msg_drv(lp)) + printk(KERN_ERR + "%s: pcnet32_restart timed out waiting for stop.\n", + dev->name); - pcnet32_purge_tx_ring(dev); - if (pcnet32_init_ring(dev)) - return; + pcnet32_purge_tx_ring(dev); + if (pcnet32_init_ring(dev)) + return; - /* ReInit Ring */ - lp->a.write_csr (ioaddr, 0, 1); - i = 0; - while (i++ < 1000) - if (lp->a.read_csr (ioaddr, 0) & 0x0100) - break; + /* ReInit Ring */ + lp->a.write_csr(ioaddr, 0, 1); + i = 0; + while (i++ < 1000) + if (lp->a.read_csr(ioaddr, 0) & 0x0100) + break; - lp->a.write_csr (ioaddr, 0, csr0_bits); + lp->a.write_csr(ioaddr, 0, csr0_bits); } - -static void -pcnet32_tx_timeout (struct net_device *dev) +static void pcnet32_tx_timeout(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr, flags; - - spin_lock_irqsave(&lp->lock, flags); - /* Transmitter timeout, serious problems. */ - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_ERR "%s: transmit timed out, status %4.4x, resetting.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); - lp->a.write_csr (ioaddr, 0, 0x0004); - lp->stats.tx_errors++; - if (netif_msg_tx_err(lp)) { - int i; - printk(KERN_DEBUG " Ring data dump: dirty_tx %d cur_tx %d%s cur_rx %d.", - lp->dirty_tx, lp->cur_tx, lp->tx_full ? " (full)" : "", - lp->cur_rx); - for (i = 0 ; i < lp->rx_ring_size; i++) - printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", - le32_to_cpu(lp->rx_ring[i].base), - (-le16_to_cpu(lp->rx_ring[i].buf_length)) & 0xffff, - le32_to_cpu(lp->rx_ring[i].msg_length), - le16_to_cpu(lp->rx_ring[i].status)); - for (i = 0 ; i < lp->tx_ring_size; i++) - printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", - le32_to_cpu(lp->tx_ring[i].base), - (-le16_to_cpu(lp->tx_ring[i].length)) & 0xffff, - le32_to_cpu(lp->tx_ring[i].misc), - le16_to_cpu(lp->tx_ring[i].status)); - printk("\n"); - } - pcnet32_restart(dev, 0x0042); - - dev->trans_start = jiffies; - netif_wake_queue(dev); - - spin_unlock_irqrestore(&lp->lock, flags); -} + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr, flags; + + spin_lock_irqsave(&lp->lock, flags); + /* Transmitter timeout, serious problems. */ + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_ERR + "%s: transmit timed out, status %4.4x, resetting.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + lp->a.write_csr(ioaddr, 0, 0x0004); + lp->stats.tx_errors++; + if (netif_msg_tx_err(lp)) { + int i; + printk(KERN_DEBUG + " Ring data dump: dirty_tx %d cur_tx %d%s cur_rx %d.", + lp->dirty_tx, lp->cur_tx, lp->tx_full ? " (full)" : "", + lp->cur_rx); + for (i = 0; i < lp->rx_ring_size; i++) + printk("%s %08x %04x %08x %04x", i & 1 ? 
"" : "\n ", + le32_to_cpu(lp->rx_ring[i].base), + (-le16_to_cpu(lp->rx_ring[i].buf_length)) & + 0xffff, le32_to_cpu(lp->rx_ring[i].msg_length), + le16_to_cpu(lp->rx_ring[i].status)); + for (i = 0; i < lp->tx_ring_size; i++) + printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", + le32_to_cpu(lp->tx_ring[i].base), + (-le16_to_cpu(lp->tx_ring[i].length)) & 0xffff, + le32_to_cpu(lp->tx_ring[i].misc), + le16_to_cpu(lp->tx_ring[i].status)); + printk("\n"); + } + pcnet32_restart(dev, 0x0042); + + dev->trans_start = jiffies; + netif_wake_queue(dev); + spin_unlock_irqrestore(&lp->lock, flags); +} -static int -pcnet32_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int pcnet32_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 status; - int entry; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 status; + int entry; + unsigned long flags; - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - if (netif_msg_tx_queued(lp)) { - printk(KERN_DEBUG "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); - } + if (netif_msg_tx_queued(lp)) { + printk(KERN_DEBUG + "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + } - /* Default status -- will not enable Successful-TxDone - * interrupt when that option is available to us. - */ - status = 0x8300; + /* Default status -- will not enable Successful-TxDone + * interrupt when that option is available to us. + */ + status = 0x8300; - /* Fill in a Tx ring entry */ + /* Fill in a Tx ring entry */ - /* Mask to ring buffer boundary. */ - entry = lp->cur_tx & lp->tx_mod_mask; + /* Mask to ring buffer boundary. */ + entry = lp->cur_tx & lp->tx_mod_mask; - /* Caution: the write order is important here, set the status - * with the "ownership" bits last. */ + /* Caution: the write order is important here, set the status + * with the "ownership" bits last. */ - lp->tx_ring[entry].length = le16_to_cpu(-skb->len); + lp->tx_ring[entry].length = le16_to_cpu(-skb->len); - lp->tx_ring[entry].misc = 0x00000000; + lp->tx_ring[entry].misc = 0x00000000; - lp->tx_skbuff[entry] = skb; - lp->tx_dma_addr[entry] = pci_map_single(lp->pci_dev, skb->data, skb->len, - PCI_DMA_TODEVICE); - lp->tx_ring[entry].base = (u32)le32_to_cpu(lp->tx_dma_addr[entry]); - wmb(); /* Make sure owner changes after all others are visible */ - lp->tx_ring[entry].status = le16_to_cpu(status); + lp->tx_skbuff[entry] = skb; + lp->tx_dma_addr[entry] = + pci_map_single(lp->pci_dev, skb->data, skb->len, PCI_DMA_TODEVICE); + lp->tx_ring[entry].base = (u32) le32_to_cpu(lp->tx_dma_addr[entry]); + wmb(); /* Make sure owner changes after all others are visible */ + lp->tx_ring[entry].status = le16_to_cpu(status); - lp->cur_tx++; - lp->stats.tx_bytes += skb->len; + lp->cur_tx++; + lp->stats.tx_bytes += skb->len; - /* Trigger an immediate send poll. */ - lp->a.write_csr (ioaddr, 0, 0x0048); + /* Trigger an immediate send poll. 
*/ + lp->a.write_csr(ioaddr, 0, 0x0048); - dev->trans_start = jiffies; + dev->trans_start = jiffies; - if (lp->tx_ring[(entry+1) & lp->tx_mod_mask].base != 0) { - lp->tx_full = 1; - netif_stop_queue(dev); - } - spin_unlock_irqrestore(&lp->lock, flags); - return 0; + if (lp->tx_ring[(entry + 1) & lp->tx_mod_mask].base != 0) { + lp->tx_full = 1; + netif_stop_queue(dev); + } + spin_unlock_irqrestore(&lp->lock, flags); + return 0; } /* The PCNET32 interrupt handler. */ static irqreturn_t -pcnet32_interrupt(int irq, void *dev_id, struct pt_regs * regs) +pcnet32_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - struct net_device *dev = dev_id; - struct pcnet32_private *lp; - unsigned long ioaddr; - u16 csr0,rap; - int boguscnt = max_interrupt_work; - int must_restart; - - if (!dev) { - if (pcnet32_debug & NETIF_MSG_INTR) - printk (KERN_DEBUG "%s(): irq %d for unknown device\n", - __FUNCTION__, irq); - return IRQ_NONE; - } - - ioaddr = dev->base_addr; - lp = dev->priv; - - spin_lock(&lp->lock); - - rap = lp->a.read_rap(ioaddr); - while ((csr0 = lp->a.read_csr (ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) { - if (csr0 == 0xffff) { - break; /* PCMCIA remove happened */ + struct net_device *dev = dev_id; + struct pcnet32_private *lp; + unsigned long ioaddr; + u16 csr0, rap; + int boguscnt = max_interrupt_work; + int must_restart; + + if (!dev) { + if (pcnet32_debug & NETIF_MSG_INTR) + printk(KERN_DEBUG "%s(): irq %d for unknown device\n", + __FUNCTION__, irq); + return IRQ_NONE; } - /* Acknowledge all of the current interrupt sources ASAP. */ - lp->a.write_csr (ioaddr, 0, csr0 & ~0x004f); - must_restart = 0; + ioaddr = dev->base_addr; + lp = dev->priv; - if (netif_msg_intr(lp)) - printk(KERN_DEBUG "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n", - dev->name, csr0, lp->a.read_csr (ioaddr, 0)); - - if (csr0 & 0x0400) /* Rx interrupt */ - pcnet32_rx(dev); - - if (csr0 & 0x0200) { /* Tx-done interrupt */ - unsigned int dirty_tx = lp->dirty_tx; - int delta; - - while (dirty_tx != lp->cur_tx) { - int entry = dirty_tx & lp->tx_mod_mask; - int status = (short)le16_to_cpu(lp->tx_ring[entry].status); - - if (status < 0) - break; /* It still hasn't been Txed */ - - lp->tx_ring[entry].base = 0; - - if (status & 0x4000) { - /* There was an major error, log it. */ - int err_status = le32_to_cpu(lp->tx_ring[entry].misc); - lp->stats.tx_errors++; - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx error status=%04x err_status=%08x\n", - dev->name, status, err_status); - if (err_status & 0x04000000) lp->stats.tx_aborted_errors++; - if (err_status & 0x08000000) lp->stats.tx_carrier_errors++; - if (err_status & 0x10000000) lp->stats.tx_window_errors++; + spin_lock(&lp->lock); + + rap = lp->a.read_rap(ioaddr); + while ((csr0 = lp->a.read_csr(ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) { + if (csr0 == 0xffff) { + break; /* PCMCIA remove happened */ + } + /* Acknowledge all of the current interrupt sources ASAP. */ + lp->a.write_csr(ioaddr, 0, csr0 & ~0x004f); + + must_restart = 0; + + if (netif_msg_intr(lp)) + printk(KERN_DEBUG + "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n", + dev->name, csr0, lp->a.read_csr(ioaddr, 0)); + + if (csr0 & 0x0400) /* Rx interrupt */ + pcnet32_rx(dev); + + if (csr0 & 0x0200) { /* Tx-done interrupt */ + unsigned int dirty_tx = lp->dirty_tx; + int delta; + + while (dirty_tx != lp->cur_tx) { + int entry = dirty_tx & lp->tx_mod_mask; + int status = + (short)le16_to_cpu(lp->tx_ring[entry]. 
+ status); + + if (status < 0) + break; /* It still hasn't been Txed */ + + lp->tx_ring[entry].base = 0; + + if (status & 0x4000) { + /* There was an major error, log it. */ + int err_status = + le32_to_cpu(lp->tx_ring[entry]. + misc); + lp->stats.tx_errors++; + if (netif_msg_tx_err(lp)) + printk(KERN_ERR + "%s: Tx error status=%04x err_status=%08x\n", + dev->name, status, + err_status); + if (err_status & 0x04000000) + lp->stats.tx_aborted_errors++; + if (err_status & 0x08000000) + lp->stats.tx_carrier_errors++; + if (err_status & 0x10000000) + lp->stats.tx_window_errors++; #ifndef DO_DXSUFLO - if (err_status & 0x40000000) { - lp->stats.tx_fifo_errors++; - /* Ackk! On FIFO errors the Tx unit is turned off! */ - /* Remove this verbosity later! */ - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", - dev->name, csr0); - must_restart = 1; - } + if (err_status & 0x40000000) { + lp->stats.tx_fifo_errors++; + /* Ackk! On FIFO errors the Tx unit is turned off! */ + /* Remove this verbosity later! */ + if (netif_msg_tx_err(lp)) + printk(KERN_ERR + "%s: Tx FIFO error! CSR0=%4.4x\n", + dev->name, csr0); + must_restart = 1; + } #else - if (err_status & 0x40000000) { - lp->stats.tx_fifo_errors++; - if (! lp->dxsuflo) { /* If controller doesn't recover ... */ - /* Ackk! On FIFO errors the Tx unit is turned off! */ - /* Remove this verbosity later! */ - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", - dev->name, csr0); - must_restart = 1; - } - } + if (err_status & 0x40000000) { + lp->stats.tx_fifo_errors++; + if (!lp->dxsuflo) { /* If controller doesn't recover ... */ + /* Ackk! On FIFO errors the Tx unit is turned off! */ + /* Remove this verbosity later! */ + if (netif_msg_tx_err + (lp)) + printk(KERN_ERR + "%s: Tx FIFO error! CSR0=%4.4x\n", + dev-> + name, + csr0); + must_restart = 1; + } + } #endif - } else { - if (status & 0x1800) - lp->stats.collisions++; - lp->stats.tx_packets++; + } else { + if (status & 0x1800) + lp->stats.collisions++; + lp->stats.tx_packets++; + } + + /* We must free the original skb */ + if (lp->tx_skbuff[entry]) { + pci_unmap_single(lp->pci_dev, + lp->tx_dma_addr[entry], + lp->tx_skbuff[entry]-> + len, PCI_DMA_TODEVICE); + dev_kfree_skb_irq(lp->tx_skbuff[entry]); + lp->tx_skbuff[entry] = NULL; + lp->tx_dma_addr[entry] = 0; + } + dirty_tx++; + } + + delta = + (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask + + lp->tx_ring_size); + if (delta > lp->tx_ring_size) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n", + dev->name, dirty_tx, lp->cur_tx, + lp->tx_full); + dirty_tx += lp->tx_ring_size; + delta -= lp->tx_ring_size; + } + + if (lp->tx_full && + netif_queue_stopped(dev) && + delta < lp->tx_ring_size - 2) { + /* The ring is no longer full, clear tbusy. */ + lp->tx_full = 0; + netif_wake_queue(dev); + } + lp->dirty_tx = dirty_tx; + } + + /* Log misc errors. */ + if (csr0 & 0x4000) + lp->stats.tx_errors++; /* Tx babble. */ + if (csr0 & 0x1000) { + /* + * this happens when our receive ring is full. This shouldn't + * be a problem as we will see normal rx interrupts for the frames + * in the receive ring. But there are some PCI chipsets (I can + * reproduce this on SP3G with Intel saturn chipset) which have + * sometimes problems and will fill up the receive ring with + * error descriptors. In this situation we don't get a rx + * interrupt, but a missed frame interrupt sooner or later. + * So we try to clean up our receive ring here. 
+ */ + pcnet32_rx(dev); + lp->stats.rx_errors++; /* Missed a Rx frame. */ + } + if (csr0 & 0x0800) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Bus master arbitration failure, status %4.4x.\n", + dev->name, csr0); + /* unlike for the lance, there is no restart needed */ } - /* We must free the original skb */ - if (lp->tx_skbuff[entry]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[entry], - lp->tx_skbuff[entry]->len, PCI_DMA_TODEVICE); - dev_kfree_skb_irq(lp->tx_skbuff[entry]); - lp->tx_skbuff[entry] = NULL; - lp->tx_dma_addr[entry] = 0; + if (must_restart) { + /* reset the chip to clear the error condition, then restart */ + lp->a.reset(ioaddr); + lp->a.write_csr(ioaddr, 4, 0x0915); + pcnet32_restart(dev, 0x0002); + netif_wake_queue(dev); } - dirty_tx++; - } - - delta = (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask + lp->tx_ring_size); - if (delta > lp->tx_ring_size) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n", - dev->name, dirty_tx, lp->cur_tx, lp->tx_full); - dirty_tx += lp->tx_ring_size; - delta -= lp->tx_ring_size; - } - - if (lp->tx_full && - netif_queue_stopped(dev) && - delta < lp->tx_ring_size - 2) { - /* The ring is no longer full, clear tbusy. */ - lp->tx_full = 0; - netif_wake_queue (dev); - } - lp->dirty_tx = dirty_tx; - } - - /* Log misc errors. */ - if (csr0 & 0x4000) lp->stats.tx_errors++; /* Tx babble. */ - if (csr0 & 0x1000) { - /* - * this happens when our receive ring is full. This shouldn't - * be a problem as we will see normal rx interrupts for the frames - * in the receive ring. But there are some PCI chipsets (I can - * reproduce this on SP3G with Intel saturn chipset) which have - * sometimes problems and will fill up the receive ring with - * error descriptors. In this situation we don't get a rx - * interrupt, but a missed frame interrupt sooner or later. - * So we try to clean up our receive ring here. - */ - pcnet32_rx(dev); - lp->stats.rx_errors++; /* Missed a Rx frame. */ - } - if (csr0 & 0x0800) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Bus master arbitration failure, status %4.4x.\n", - dev->name, csr0); - /* unlike for the lance, there is no restart needed */ - } - - if (must_restart) { - /* reset the chip to clear the error condition, then restart */ - lp->a.reset(ioaddr); - lp->a.write_csr(ioaddr, 4, 0x0915); - pcnet32_restart(dev, 0x0002); - netif_wake_queue(dev); - } - } - - /* Set interrupt enable. */ - lp->a.write_csr (ioaddr, 0, 0x0040); - lp->a.write_rap (ioaddr,rap); - - if (netif_msg_intr(lp)) - printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n", - dev->name, lp->a.read_csr (ioaddr, 0)); - - spin_unlock(&lp->lock); - - return IRQ_HANDLED; + } + + /* Set interrupt enable. */ + lp->a.write_csr(ioaddr, 0, 0x0040); + lp->a.write_rap(ioaddr, rap); + + if (netif_msg_intr(lp)) + printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + + spin_unlock(&lp->lock); + + return IRQ_HANDLED; } -static int -pcnet32_rx(struct net_device *dev) +static int pcnet32_rx(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int entry = lp->cur_rx & lp->rx_mod_mask; - int boguscnt = lp->rx_ring_size / 2; - - /* If we own the next entry, it's a new packet. Send it up. */ - while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) { - int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8; - - if (status != 0x03) { /* There was an error. 
*/ - /* - * There is a tricky error noted by John Murphy, - * <murf@perftech.com> to Russ Nelson: Even with full-sized - * buffers it's possible for a jabber packet to use two - * buffers, with only the last correctly noting the error. - */ - if (status & 0x01) /* Only count a general error at the */ - lp->stats.rx_errors++; /* end of a packet.*/ - if (status & 0x20) lp->stats.rx_frame_errors++; - if (status & 0x10) lp->stats.rx_over_errors++; - if (status & 0x08) lp->stats.rx_crc_errors++; - if (status & 0x04) lp->stats.rx_fifo_errors++; - lp->rx_ring[entry].status &= le16_to_cpu(0x03ff); - } else { - /* Malloc up new buffer, compatible with net-2e. */ - short pkt_len = (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff)-4; - struct sk_buff *skb; - - /* Discard oversize frames. */ - if (unlikely(pkt_len > PKT_BUF_SZ - 2)) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Impossible packet size %d!\n", - dev->name, pkt_len); - lp->stats.rx_errors++; - } else if (pkt_len < 60) { - if (netif_msg_rx_err(lp)) - printk(KERN_ERR "%s: Runt packet!\n", dev->name); - lp->stats.rx_errors++; - } else { - int rx_in_place = 0; - - if (pkt_len > rx_copybreak) { - struct sk_buff *newskb; - - if ((newskb = dev_alloc_skb(PKT_BUF_SZ))) { - skb_reserve (newskb, 2); - skb = lp->rx_skbuff[entry]; - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - skb_put (skb, pkt_len); - lp->rx_skbuff[entry] = newskb; - newskb->dev = dev; - lp->rx_dma_addr[entry] = - pci_map_single(lp->pci_dev, newskb->data, - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - lp->rx_ring[entry].base = le32_to_cpu(lp->rx_dma_addr[entry]); - rx_in_place = 1; - } else - skb = NULL; + struct pcnet32_private *lp = dev->priv; + int entry = lp->cur_rx & lp->rx_mod_mask; + int boguscnt = lp->rx_ring_size / 2; + + /* If we own the next entry, it's a new packet. Send it up. */ + while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) { + int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8; + + if (status != 0x03) { /* There was an error. */ + /* + * There is a tricky error noted by John Murphy, + * <murf@perftech.com> to Russ Nelson: Even with full-sized + * buffers it's possible for a jabber packet to use two + * buffers, with only the last correctly noting the error. + */ + if (status & 0x01) /* Only count a general error at the */ + lp->stats.rx_errors++; /* end of a packet. 
*/ + if (status & 0x20) + lp->stats.rx_frame_errors++; + if (status & 0x10) + lp->stats.rx_over_errors++; + if (status & 0x08) + lp->stats.rx_crc_errors++; + if (status & 0x04) + lp->stats.rx_fifo_errors++; + lp->rx_ring[entry].status &= le16_to_cpu(0x03ff); } else { - skb = dev_alloc_skb(pkt_len+2); - } - - if (skb == NULL) { - int i; - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Memory squeeze, deferring packet.\n", - dev->name); - for (i = 0; i < lp->rx_ring_size; i++) - if ((short)le16_to_cpu(lp->rx_ring[(entry+i) - & lp->rx_mod_mask].status) < 0) - break; - - if (i > lp->rx_ring_size -2) { - lp->stats.rx_dropped++; - lp->rx_ring[entry].status |= le16_to_cpu(0x8000); - wmb(); /* Make sure adapter sees owner change */ - lp->cur_rx++; - } - break; - } - skb->dev = dev; - if (!rx_in_place) { - skb_reserve(skb,2); /* 16 byte align */ - skb_put(skb,pkt_len); /* Make room */ - pci_dma_sync_single_for_cpu(lp->pci_dev, - lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - eth_copy_and_sum(skb, - (unsigned char *)(lp->rx_skbuff[entry]->data), - pkt_len,0); - pci_dma_sync_single_for_device(lp->pci_dev, - lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); + /* Malloc up new buffer, compatible with net-2e. */ + short pkt_len = + (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff) + - 4; + struct sk_buff *skb; + + /* Discard oversize frames. */ + if (unlikely(pkt_len > PKT_BUF_SZ - 2)) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Impossible packet size %d!\n", + dev->name, pkt_len); + lp->stats.rx_errors++; + } else if (pkt_len < 60) { + if (netif_msg_rx_err(lp)) + printk(KERN_ERR "%s: Runt packet!\n", + dev->name); + lp->stats.rx_errors++; + } else { + int rx_in_place = 0; + + if (pkt_len > rx_copybreak) { + struct sk_buff *newskb; + + if ((newskb = + dev_alloc_skb(PKT_BUF_SZ))) { + skb_reserve(newskb, 2); + skb = lp->rx_skbuff[entry]; + pci_unmap_single(lp->pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ - 2, + PCI_DMA_FROMDEVICE); + skb_put(skb, pkt_len); + lp->rx_skbuff[entry] = newskb; + newskb->dev = dev; + lp->rx_dma_addr[entry] = + pci_map_single(lp->pci_dev, + newskb->data, + PKT_BUF_SZ - + 2, + PCI_DMA_FROMDEVICE); + lp->rx_ring[entry].base = + le32_to_cpu(lp-> + rx_dma_addr + [entry]); + rx_in_place = 1; + } else + skb = NULL; + } else { + skb = dev_alloc_skb(pkt_len + 2); + } + + if (skb == NULL) { + int i; + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Memory squeeze, deferring packet.\n", + dev->name); + for (i = 0; i < lp->rx_ring_size; i++) + if ((short) + le16_to_cpu(lp-> + rx_ring[(entry + + i) + & lp-> + rx_mod_mask]. 
+ status) < 0) + break; + + if (i > lp->rx_ring_size - 2) { + lp->stats.rx_dropped++; + lp->rx_ring[entry].status |= + le16_to_cpu(0x8000); + wmb(); /* Make sure adapter sees owner change */ + lp->cur_rx++; + } + break; + } + skb->dev = dev; + if (!rx_in_place) { + skb_reserve(skb, 2); /* 16 byte align */ + skb_put(skb, pkt_len); /* Make room */ + pci_dma_sync_single_for_cpu(lp->pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ - + 2, + PCI_DMA_FROMDEVICE); + eth_copy_and_sum(skb, + (unsigned char *)(lp-> + rx_skbuff + [entry]-> + data), + pkt_len, 0); + pci_dma_sync_single_for_device(lp-> + pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ + - 2, + PCI_DMA_FROMDEVICE); + } + lp->stats.rx_bytes += skb->len; + skb->protocol = eth_type_trans(skb, dev); + netif_rx(skb); + dev->last_rx = jiffies; + lp->stats.rx_packets++; + } } - lp->stats.rx_bytes += skb->len; - skb->protocol=eth_type_trans(skb,dev); - netif_rx(skb); - dev->last_rx = jiffies; - lp->stats.rx_packets++; - } + /* + * The docs say that the buffer length isn't touched, but Andrew Boyd + * of QNX reports that some revs of the 79C965 clear it. + */ + lp->rx_ring[entry].buf_length = le16_to_cpu(2 - PKT_BUF_SZ); + wmb(); /* Make sure owner changes after all others are visible */ + lp->rx_ring[entry].status |= le16_to_cpu(0x8000); + entry = (++lp->cur_rx) & lp->rx_mod_mask; + if (--boguscnt <= 0) + break; /* don't stay in loop forever */ } - /* - * The docs say that the buffer length isn't touched, but Andrew Boyd - * of QNX reports that some revs of the 79C965 clear it. - */ - lp->rx_ring[entry].buf_length = le16_to_cpu(2-PKT_BUF_SZ); - wmb(); /* Make sure owner changes after all others are visible */ - lp->rx_ring[entry].status |= le16_to_cpu(0x8000); - entry = (++lp->cur_rx) & lp->rx_mod_mask; - if (--boguscnt <= 0) break; /* don't stay in loop forever */ - } - - return 0; + + return 0; } -static int -pcnet32_close(struct net_device *dev) +static int pcnet32_close(struct net_device *dev) { - unsigned long ioaddr = dev->base_addr; - struct pcnet32_private *lp = dev->priv; - int i; - unsigned long flags; + unsigned long ioaddr = dev->base_addr; + struct pcnet32_private *lp = dev->priv; + int i; + unsigned long flags; - del_timer_sync(&lp->watchdog_timer); + del_timer_sync(&lp->watchdog_timer); - netif_stop_queue(dev); + netif_stop_queue(dev); - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); + lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112); - if (netif_msg_ifdown(lp)) - printk(KERN_DEBUG "%s: Shutting down ethercard, status was %2.2x.\n", - dev->name, lp->a.read_csr (ioaddr, 0)); + if (netif_msg_ifdown(lp)) + printk(KERN_DEBUG + "%s: Shutting down ethercard, status was %2.2x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); - /* We stop the PCNET32 here -- it occasionally polls memory if we don't. */ - lp->a.write_csr (ioaddr, 0, 0x0004); + /* We stop the PCNET32 here -- it occasionally polls memory if we don't. 
*/ + lp->a.write_csr(ioaddr, 0, 0x0004); - /* - * Switch back to 16bit mode to avoid problems with dumb - * DOS packet driver after a warm reboot - */ - lp->a.write_bcr (ioaddr, 20, 4); + /* + * Switch back to 16bit mode to avoid problems with dumb + * DOS packet driver after a warm reboot + */ + lp->a.write_bcr(ioaddr, 20, 4); - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - free_irq(dev->irq, dev); + free_irq(dev->irq, dev); - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - /* free all allocated skbuffs */ - for (i = 0; i < lp->rx_ring_size; i++) { - lp->rx_ring[i].status = 0; - wmb(); /* Make sure adapter sees owner change */ - if (lp->rx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - dev_kfree_skb(lp->rx_skbuff[i]); + /* free all allocated skbuffs */ + for (i = 0; i < lp->rx_ring_size; i++) { + lp->rx_ring[i].status = 0; + wmb(); /* Make sure adapter sees owner change */ + if (lp->rx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + dev_kfree_skb(lp->rx_skbuff[i]); + } + lp->rx_skbuff[i] = NULL; + lp->rx_dma_addr[i] = 0; } - lp->rx_skbuff[i] = NULL; - lp->rx_dma_addr[i] = 0; - } - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - if (lp->tx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], - lp->tx_skbuff[i]->len, PCI_DMA_TODEVICE); - dev_kfree_skb(lp->tx_skbuff[i]); + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + if (lp->tx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], + lp->tx_skbuff[i]->len, + PCI_DMA_TODEVICE); + dev_kfree_skb(lp->tx_skbuff[i]); + } + lp->tx_skbuff[i] = NULL; + lp->tx_dma_addr[i] = 0; } - lp->tx_skbuff[i] = NULL; - lp->tx_dma_addr[i] = 0; - } - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - return 0; + return 0; } -static struct net_device_stats * -pcnet32_get_stats(struct net_device *dev) +static struct net_device_stats *pcnet32_get_stats(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 saved_addr; - unsigned long flags; - - spin_lock_irqsave(&lp->lock, flags); - saved_addr = lp->a.read_rap(ioaddr); - lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); - lp->a.write_rap(ioaddr, saved_addr); - spin_unlock_irqrestore(&lp->lock, flags); - - return &lp->stats; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 saved_addr; + unsigned long flags; + + spin_lock_irqsave(&lp->lock, flags); + saved_addr = lp->a.read_rap(ioaddr); + lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112); + lp->a.write_rap(ioaddr, saved_addr); + spin_unlock_irqrestore(&lp->lock, flags); + + return &lp->stats; } /* taken from the sunlance driver, which it took from the depca driver */ -static void pcnet32_load_multicast (struct net_device *dev) +static void pcnet32_load_multicast(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - volatile struct pcnet32_init_block *ib = &lp->init_block; - volatile u16 *mcast_table = (u16 *)&ib->filter; - struct dev_mc_list *dmi=dev->mc_list; - char *addrs; - int i; - u32 crc; - - /* set all multicast bits */ - if (dev->flags & IFF_ALLMULTI) { - ib->filter[0] = 0xffffffff; - ib->filter[1] 
= 0xffffffff; + struct pcnet32_private *lp = dev->priv; + volatile struct pcnet32_init_block *ib = &lp->init_block; + volatile u16 *mcast_table = (u16 *) & ib->filter; + struct dev_mc_list *dmi = dev->mc_list; + char *addrs; + int i; + u32 crc; + + /* set all multicast bits */ + if (dev->flags & IFF_ALLMULTI) { + ib->filter[0] = 0xffffffff; + ib->filter[1] = 0xffffffff; + return; + } + /* clear the multicast filter */ + ib->filter[0] = 0; + ib->filter[1] = 0; + + /* Add addresses */ + for (i = 0; i < dev->mc_count; i++) { + addrs = dmi->dmi_addr; + dmi = dmi->next; + + /* multicast address? */ + if (!(*addrs & 1)) + continue; + + crc = ether_crc_le(6, addrs); + crc = crc >> 26; + mcast_table[crc >> 4] = + le16_to_cpu(le16_to_cpu(mcast_table[crc >> 4]) | + (1 << (crc & 0xf))); + } return; - } - /* clear the multicast filter */ - ib->filter[0] = 0; - ib->filter[1] = 0; - - /* Add addresses */ - for (i = 0; i < dev->mc_count; i++) { - addrs = dmi->dmi_addr; - dmi = dmi->next; - - /* multicast address? */ - if (!(*addrs & 1)) - continue; - - crc = ether_crc_le(6, addrs); - crc = crc >> 26; - mcast_table [crc >> 4] = le16_to_cpu( - le16_to_cpu(mcast_table [crc >> 4]) | (1 << (crc & 0xf))); - } - return; } - /* * Set or clear the multicast filter for this adaptor. */ static void pcnet32_set_multicast_list(struct net_device *dev) { - unsigned long ioaddr = dev->base_addr, flags; - struct pcnet32_private *lp = dev->priv; - - spin_lock_irqsave(&lp->lock, flags); - if (dev->flags&IFF_PROMISC) { - /* Log any net taps. */ - if (netif_msg_hw(lp)) - printk(KERN_INFO "%s: Promiscuous mode enabled.\n", dev->name); - lp->init_block.mode = le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) << 7); - } else { - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - pcnet32_load_multicast (dev); - } - - lp->a.write_csr (ioaddr, 0, 0x0004); /* Temporarily stop the lance. */ - pcnet32_restart(dev, 0x0042); /* Resume normal operation */ - netif_wake_queue(dev); - - spin_unlock_irqrestore(&lp->lock, flags); + unsigned long ioaddr = dev->base_addr, flags; + struct pcnet32_private *lp = dev->priv; + + spin_lock_irqsave(&lp->lock, flags); + if (dev->flags & IFF_PROMISC) { + /* Log any net taps. */ + if (netif_msg_hw(lp)) + printk(KERN_INFO "%s: Promiscuous mode enabled.\n", + dev->name); + lp->init_block.mode = + le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) << + 7); + } else { + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + pcnet32_load_multicast(dev); + } + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Temporarily stop the lance. 
*/ + pcnet32_restart(dev, 0x0042); /* Resume normal operation */ + netif_wake_queue(dev); + + spin_unlock_irqrestore(&lp->lock, flags); } /* This routine assumes that the lp->lock is held */ static int mdio_read(struct net_device *dev, int phy_id, int reg_num) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 val_out; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 val_out; - if (!lp->mii) - return 0; + if (!lp->mii) + return 0; - lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); - val_out = lp->a.read_bcr(ioaddr, 34); + lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); + val_out = lp->a.read_bcr(ioaddr, 34); - return val_out; + return val_out; } /* This routine assumes that the lp->lock is held */ static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; - if (!lp->mii) - return; + if (!lp->mii) + return; - lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); - lp->a.write_bcr(ioaddr, 34, val); + lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); + lp->a.write_bcr(ioaddr, 34, val); } static int pcnet32_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { - struct pcnet32_private *lp = dev->priv; - int rc; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + int rc; + unsigned long flags; + + /* SIOC[GS]MIIxxx ioctls */ + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL); + spin_unlock_irqrestore(&lp->lock, flags); + } else { + rc = -EOPNOTSUPP; + } + + return rc; +} + +static int pcnet32_check_otherphy(struct net_device *dev) +{ + struct pcnet32_private *lp = dev->priv; + struct mii_if_info mii = lp->mii_if; + u16 bmcr; + int i; - /* SIOC[GS]MIIxxx ioctls */ - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL); - spin_unlock_irqrestore(&lp->lock, flags); - } else { - rc = -EOPNOTSUPP; - } + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + if (i == lp->mii_if.phy_id) + continue; /* skip active phy */ + if (lp->phymask & (1 << i)) { + mii.phy_id = i; + if (mii_link_ok(&mii)) { + /* found PHY with active link */ + if (netif_msg_link(lp)) + printk(KERN_INFO + "%s: Using PHY number %d.\n", + dev->name, i); + + /* isolate inactive phy */ + bmcr = + mdio_read(dev, lp->mii_if.phy_id, MII_BMCR); + mdio_write(dev, lp->mii_if.phy_id, MII_BMCR, + bmcr | BMCR_ISOLATE); + + /* de-isolate new phy */ + bmcr = mdio_read(dev, i, MII_BMCR); + mdio_write(dev, i, MII_BMCR, + bmcr & ~BMCR_ISOLATE); + + /* set new phy address */ + lp->mii_if.phy_id = i; + return 1; + } + } + } + return 0; +} + +/* + * Show the status of the media. Similar to mii_check_media however it + * correctly shows the link speed for all (tested) pcnet32 variants. + * Devices with no mii just report link state without speed. + * + * Caller is assumed to hold and release the lp->lock. + */ - return rc; +static void pcnet32_check_media(struct net_device *dev, int verbose) +{ + struct pcnet32_private *lp = dev->priv; + int curr_link; + int prev_link = netif_carrier_ok(dev) ? 
1 : 0; + u32 bcr9; + + if (lp->mii) { + curr_link = mii_link_ok(&lp->mii_if); + } else { + ulong ioaddr = dev->base_addr; /* card base I/O address */ + curr_link = (lp->a.read_bcr(ioaddr, 4) != 0xc0); + } + if (!curr_link) { + if (prev_link || verbose) { + netif_carrier_off(dev); + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: link down\n", dev->name); + } + if (lp->phycount > 1) { + curr_link = pcnet32_check_otherphy(dev); + prev_link = 0; + } + } else if (verbose || !prev_link) { + netif_carrier_on(dev); + if (lp->mii) { + if (netif_msg_link(lp)) { + struct ethtool_cmd ecmd; + mii_ethtool_gset(&lp->mii_if, &ecmd); + printk(KERN_INFO + "%s: link up, %sMbps, %s-duplex\n", + dev->name, + (ecmd.speed == SPEED_100) ? "100" : "10", + (ecmd.duplex == + DUPLEX_FULL) ? "full" : "half"); + } + bcr9 = lp->a.read_bcr(dev->base_addr, 9); + if ((bcr9 & (1 << 0)) != lp->mii_if.full_duplex) { + if (lp->mii_if.full_duplex) + bcr9 |= (1 << 0); + else + bcr9 &= ~(1 << 0); + lp->a.write_bcr(dev->base_addr, 9, bcr9); + } + } else { + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: link up\n", dev->name); + } + } } +/* + * Check for loss of link and link establishment. + * Can not use mii_check_media because it does nothing if mode is forced. + */ + static void pcnet32_watchdog(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; - /* Print the link status if it has changed */ - if (lp->mii) { + /* Print the link status if it has changed */ spin_lock_irqsave(&lp->lock, flags); - mii_check_media (&lp->mii_if, netif_msg_link(lp), 0); + pcnet32_check_media(dev, 0); spin_unlock_irqrestore(&lp->lock, flags); - } - mod_timer (&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); + mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); } static void __devexit pcnet32_remove_one(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - - if (dev) { - struct pcnet32_private *lp = dev->priv; - - unregister_netdev(dev); - pcnet32_free_ring(dev); - release_region(dev->base_addr, PCNET32_TOTAL_SIZE); - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); - free_netdev(dev); - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); - } + struct net_device *dev = pci_get_drvdata(pdev); + + if (dev) { + struct pcnet32_private *lp = dev->priv; + + unregister_netdev(dev); + pcnet32_free_ring(dev); + release_region(dev->base_addr, PCNET32_TOTAL_SIZE); + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + free_netdev(dev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + } } static struct pci_driver pcnet32_driver = { - .name = DRV_NAME, - .probe = pcnet32_probe_pci, - .remove = __devexit_p(pcnet32_remove_one), - .id_table = pcnet32_pci_tbl, + .name = DRV_NAME, + .probe = pcnet32_probe_pci, + .remove = __devexit_p(pcnet32_remove_one), + .id_table = pcnet32_pci_tbl, }; /* An additional parameter that may be passed in... 
*/ @@ -2477,9 +2698,11 @@ static int pcnet32_have_pci; module_param(debug, int, 0); MODULE_PARM_DESC(debug, DRV_NAME " debug level"); module_param(max_interrupt_work, int, 0); -MODULE_PARM_DESC(max_interrupt_work, DRV_NAME " maximum events handled per interrupt"); +MODULE_PARM_DESC(max_interrupt_work, + DRV_NAME " maximum events handled per interrupt"); module_param(rx_copybreak, int, 0); -MODULE_PARM_DESC(rx_copybreak, DRV_NAME " copy breakpoint for copy-only-tiny-frames"); +MODULE_PARM_DESC(rx_copybreak, + DRV_NAME " copy breakpoint for copy-only-tiny-frames"); module_param(tx_start_pt, int, 0); MODULE_PARM_DESC(tx_start_pt, DRV_NAME " transmit start point (0-3)"); module_param(pcnet32vlb, int, 0); @@ -2490,7 +2713,9 @@ module_param_array(full_duplex, int, NULL, 0); MODULE_PARM_DESC(full_duplex, DRV_NAME " full duplex setting(s) (1)"); /* Module Parameter for HomePNA cards added by Patrick Simmons, 2004 */ module_param_array(homepna, int, NULL, 0); -MODULE_PARM_DESC(homepna, DRV_NAME " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet"); +MODULE_PARM_DESC(homepna, + DRV_NAME + " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet"); MODULE_AUTHOR("Thomas Bogendoerfer"); MODULE_DESCRIPTION("Driver for PCnet32 and PCnetPCI based ethercards"); @@ -2500,44 +2725,44 @@ MODULE_LICENSE("GPL"); static int __init pcnet32_init_module(void) { - printk(KERN_INFO "%s", version); + printk(KERN_INFO "%s", version); - pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT); + pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT); - if ((tx_start_pt >= 0) && (tx_start_pt <= 3)) - tx_start = tx_start_pt; + if ((tx_start_pt >= 0) && (tx_start_pt <= 3)) + tx_start = tx_start_pt; - /* find the PCI devices */ - if (!pci_module_init(&pcnet32_driver)) - pcnet32_have_pci = 1; + /* find the PCI devices */ + if (!pci_module_init(&pcnet32_driver)) + pcnet32_have_pci = 1; - /* should we find any remaining VLbus devices ? */ - if (pcnet32vlb) - pcnet32_probe_vlbus(); + /* should we find any remaining VLbus devices ? */ + if (pcnet32vlb) + pcnet32_probe_vlbus(); - if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE)) - printk(KERN_INFO PFX "%d cards_found.\n", cards_found); + if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE)) + printk(KERN_INFO PFX "%d cards_found.\n", cards_found); - return (pcnet32_have_pci + cards_found) ? 0 : -ENODEV; + return (pcnet32_have_pci + cards_found) ? 
0 : -ENODEV; } static void __exit pcnet32_cleanup_module(void) { - struct net_device *next_dev; - - while (pcnet32_dev) { - struct pcnet32_private *lp = pcnet32_dev->priv; - next_dev = lp->next; - unregister_netdev(pcnet32_dev); - pcnet32_free_ring(pcnet32_dev); - release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE); - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); - free_netdev(pcnet32_dev); - pcnet32_dev = next_dev; - } + struct net_device *next_dev; + + while (pcnet32_dev) { + struct pcnet32_private *lp = pcnet32_dev->priv; + next_dev = lp->next; + unregister_netdev(pcnet32_dev); + pcnet32_free_ring(pcnet32_dev); + release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE); + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + free_netdev(pcnet32_dev); + pcnet32_dev = next_dev; + } - if (pcnet32_have_pci) - pci_unregister_driver(&pcnet32_driver); + if (pcnet32_have_pci) + pci_unregister_driver(&pcnet32_driver); } module_init(pcnet32_init_module); diff --git a/drivers/net/skfp/fplustm.c b/drivers/net/skfp/fplustm.c index a4b2b6975d6..0784f558ca9 100644 --- a/drivers/net/skfp/fplustm.c +++ b/drivers/net/skfp/fplustm.c @@ -549,12 +549,12 @@ void formac_tx_restart(struct s_smc *smc) static void enable_formac(struct s_smc *smc) { /* set formac IMSK : 0 enables irq */ - outpw(FM_A(FM_IMSK1U),~mac_imsk1u) ; - outpw(FM_A(FM_IMSK1L),~mac_imsk1l) ; - outpw(FM_A(FM_IMSK2U),~mac_imsk2u) ; - outpw(FM_A(FM_IMSK2L),~mac_imsk2l) ; - outpw(FM_A(FM_IMSK3U),~mac_imsk3u) ; - outpw(FM_A(FM_IMSK3L),~mac_imsk3l) ; + outpw(FM_A(FM_IMSK1U),(unsigned short)~mac_imsk1u); + outpw(FM_A(FM_IMSK1L),(unsigned short)~mac_imsk1l); + outpw(FM_A(FM_IMSK2U),(unsigned short)~mac_imsk2u); + outpw(FM_A(FM_IMSK2L),(unsigned short)~mac_imsk2l); + outpw(FM_A(FM_IMSK3U),(unsigned short)~mac_imsk3u); + outpw(FM_A(FM_IMSK3L),(unsigned short)~mac_imsk3l); } #if 0 /* Removed because the driver should use the ASICs TX complete IRQ. 
*/ diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 25e028b7ce4..4eda81d41b1 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -44,7 +44,7 @@ #include "skge.h" #define DRV_NAME "skge" -#define DRV_VERSION "1.3" +#define DRV_VERSION "1.4" #define PFX DRV_NAME " " #define DEFAULT_TX_RING_SIZE 128 @@ -104,7 +104,6 @@ static const int txqaddr[] = { Q_XA1, Q_XA2 }; static const int rxqaddr[] = { Q_R1, Q_R2 }; static const u32 rxirqmask[] = { IS_R1_F, IS_R2_F }; static const u32 txirqmask[] = { IS_XA1_F, IS_XA2_F }; -static const u32 portirqmask[] = { IS_PORT_1, IS_PORT_2 }; static int skge_get_regs_len(struct net_device *dev) { @@ -728,19 +727,18 @@ static struct ethtool_ops skge_ethtool_ops = { * Allocate ring elements and chain them together * One-to-one association of board descriptors with ring elements */ -static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u64 base) +static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u32 base) { struct skge_tx_desc *d; struct skge_element *e; int i; - ring->start = kmalloc(sizeof(*e)*ring->count, GFP_KERNEL); + ring->start = kcalloc(sizeof(*e), ring->count, GFP_KERNEL); if (!ring->start) return -ENOMEM; for (i = 0, e = ring->start, d = vaddr; i < ring->count; i++, e++, d++) { e->desc = d; - e->skb = NULL; if (i == ring->count - 1) { e->next = ring->start; d->next_offset = base; @@ -2169,27 +2167,31 @@ static int skge_up(struct net_device *dev) if (!skge->mem) return -ENOMEM; + BUG_ON(skge->dma & 7); + + if ((u64)skge->dma >> 32 != ((u64) skge->dma + skge->mem_size) >> 32) { + printk(KERN_ERR PFX "pci_alloc_consistent region crosses 4G boundary\n"); + err = -EINVAL; + goto free_pci_mem; + } + memset(skge->mem, 0, skge->mem_size); - if ((err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma))) + err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma); + if (err) goto free_pci_mem; err = skge_rx_fill(skge); if (err) goto free_rx_ring; - if ((err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size, - skge->dma + rx_size))) + err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size, + skge->dma + rx_size); + if (err) goto free_rx_ring; skge->tx_avail = skge->tx_ring.count - 1; - /* Enable IRQ from port */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= portirqmask[port]; - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - /* Initialize MAC */ spin_lock_bh(&hw->phy_lock); if (hw->chip_id == CHIP_ID_GENESIS) @@ -2246,11 +2248,6 @@ static int skge_down(struct net_device *dev) else yukon_stop(skge); - spin_lock_irq(&hw->hw_lock); - hw->intr_mask &= ~portirqmask[skge->port]; - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - /* Stop transmitter */ skge_write8(hw, Q_ADDR(txqaddr[port], Q_CSR), CSR_STOP); skge_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL), @@ -2307,18 +2304,15 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) int i; u32 control, len; u64 map; - unsigned long flags; skb = skb_padto(skb, ETH_ZLEN); if (!skb) return NETDEV_TX_OK; - local_irq_save(flags); if (!spin_trylock(&skge->tx_lock)) { - /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + /* Collision - tell upper layer to requeue */ + return NETDEV_TX_LOCKED; + } if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) { if (!netif_queue_stopped(dev)) { @@ -2327,7 +2321,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) printk(KERN_WARNING PFX "%s: ring full when queue awake!\n", 
dev->name); } - spin_unlock_irqrestore(&skge->tx_lock, flags); + spin_unlock(&skge->tx_lock); return NETDEV_TX_BUSY; } @@ -2402,8 +2396,10 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); } + mmiowb(); + spin_unlock(&skge->tx_lock); + dev->trans_start = jiffies; - spin_unlock_irqrestore(&skge->tx_lock, flags); return NETDEV_TX_OK; } @@ -2416,7 +2412,7 @@ static inline void skge_tx_free(struct skge_hw *hw, struct skge_element *e) pci_unmap_addr(e, mapaddr), pci_unmap_len(e, maplen), PCI_DMA_TODEVICE); - dev_kfree_skb_any(e->skb); + dev_kfree_skb(e->skb); e->skb = NULL; } else { pci_unmap_page(hw->pdev, @@ -2430,15 +2426,14 @@ static void skge_tx_clean(struct skge_port *skge) { struct skge_ring *ring = &skge->tx_ring; struct skge_element *e; - unsigned long flags; - spin_lock_irqsave(&skge->tx_lock, flags); + spin_lock_bh(&skge->tx_lock); for (e = ring->to_clean; e != ring->to_use; e = e->next) { ++skge->tx_avail; skge_tx_free(skge->hw, e); } ring->to_clean = e; - spin_unlock_irqrestore(&skge->tx_lock, flags); + spin_unlock_bh(&skge->tx_lock); } static void skge_tx_timeout(struct net_device *dev) @@ -2663,6 +2658,37 @@ resubmit: return NULL; } +static void skge_tx_done(struct skge_port *skge) +{ + struct skge_ring *ring = &skge->tx_ring; + struct skge_element *e; + + spin_lock(&skge->tx_lock); + for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) { + struct skge_tx_desc *td = e->desc; + u32 control; + + rmb(); + control = td->control; + if (control & BMU_OWN) + break; + + if (unlikely(netif_msg_tx_done(skge))) + printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n", + skge->netdev->name, e - ring->start, td->status); + + skge_tx_free(skge->hw, e); + e->skb = NULL; + ++skge->tx_avail; + } + ring->to_clean = e; + skge_write8(skge->hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F); + + if (skge->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(skge->netdev); + + spin_unlock(&skge->tx_lock); +} static int skge_poll(struct net_device *dev, int *budget) { @@ -2670,8 +2696,10 @@ static int skge_poll(struct net_device *dev, int *budget) struct skge_hw *hw = skge->hw; struct skge_ring *ring = &skge->rx_ring; struct skge_element *e; - unsigned int to_do = min(dev->quota, *budget); - unsigned int work_done = 0; + int to_do = min(dev->quota, *budget); + int work_done = 0; + + skge_tx_done(skge); for (e = ring->to_clean; prefetch(e->next), work_done < to_do; e = e->next) { struct skge_rx_desc *rd = e->desc; @@ -2683,8 +2711,8 @@ static int skge_poll(struct net_device *dev, int *budget) if (control & BMU_OWN) break; - skb = skge_rx_get(skge, e, control, rd->status, - le16_to_cpu(rd->csum2)); + skb = skge_rx_get(skge, e, control, rd->status, + le16_to_cpu(rd->csum2)); if (likely(skb)) { dev->last_rx = jiffies; netif_receive_skb(skb); @@ -2705,49 +2733,15 @@ static int skge_poll(struct net_device *dev, int *budget) if (work_done >= to_do) return 1; /* not done */ - spin_lock_irq(&hw->hw_lock); - __netif_rx_complete(dev); - hw->intr_mask |= portirqmask[skge->port]; + netif_rx_complete(dev); + mmiowb(); + + hw->intr_mask |= skge->port == 0 ? 
(IS_R1_F|IS_XA1_F) : (IS_R2_F|IS_XA2_F); skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); return 0; } -static inline void skge_tx_intr(struct net_device *dev) -{ - struct skge_port *skge = netdev_priv(dev); - struct skge_hw *hw = skge->hw; - struct skge_ring *ring = &skge->tx_ring; - struct skge_element *e; - - spin_lock(&skge->tx_lock); - for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) { - struct skge_tx_desc *td = e->desc; - u32 control; - - rmb(); - control = td->control; - if (control & BMU_OWN) - break; - - if (unlikely(netif_msg_tx_done(skge))) - printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n", - dev->name, e - ring->start, td->status); - - skge_tx_free(hw, e); - e->skb = NULL; - ++skge->tx_avail; - } - ring->to_clean = e; - skge_write8(hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F); - - if (skge->tx_avail > MAX_SKB_FRAGS + 1) - netif_wake_queue(dev); - - spin_unlock(&skge->tx_lock); -} - /* Parity errors seem to happen when Genesis is connected to a switch * with no other ports present. Heartbeat error?? */ @@ -2770,17 +2764,6 @@ static void skge_mac_parity(struct skge_hw *hw, int port) ? GMF_CLI_TX_FC : GMF_CLI_TX_PE); } -static void skge_pci_clear(struct skge_hw *hw) -{ - u16 status; - - pci_read_config_word(hw->pdev, PCI_STATUS, &status); - skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); - pci_write_config_word(hw->pdev, PCI_STATUS, - status | PCI_STATUS_ERROR_BITS); - skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); -} - static void skge_mac_intr(struct skge_hw *hw, int port) { if (hw->chip_id == CHIP_ID_GENESIS) @@ -2822,23 +2805,39 @@ static void skge_error_irq(struct skge_hw *hw) if (hwstatus & IS_M2_PAR_ERR) skge_mac_parity(hw, 1); - if (hwstatus & IS_R1_PAR_ERR) + if (hwstatus & IS_R1_PAR_ERR) { + printk(KERN_ERR PFX "%s: receive queue parity error\n", + hw->dev[0]->name); skge_write32(hw, B0_R1_CSR, CSR_IRQ_CL_P); + } - if (hwstatus & IS_R2_PAR_ERR) + if (hwstatus & IS_R2_PAR_ERR) { + printk(KERN_ERR PFX "%s: receive queue parity error\n", + hw->dev[1]->name); skge_write32(hw, B0_R2_CSR, CSR_IRQ_CL_P); + } if (hwstatus & (IS_IRQ_MST_ERR|IS_IRQ_STAT)) { - printk(KERN_ERR PFX "hardware error detected (status 0x%x)\n", - hwstatus); + u16 pci_status, pci_cmd; + + pci_read_config_word(hw->pdev, PCI_COMMAND, &pci_cmd); + pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status); - skge_pci_clear(hw); + printk(KERN_ERR PFX "%s: PCI error cmd=%#x status=%#x\n", + pci_name(hw->pdev), pci_cmd, pci_status); + + /* Write the error bits back to clear them. 
*/ + pci_status &= PCI_STATUS_ERROR_BITS; + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); + pci_write_config_word(hw->pdev, PCI_COMMAND, + pci_cmd | PCI_COMMAND_SERR | PCI_COMMAND_PARITY); + pci_write_config_word(hw->pdev, PCI_STATUS, pci_status); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); /* if error still set then just ignore it */ hwstatus = skge_read32(hw, B0_HWE_ISRC); if (hwstatus & IS_IRQ_STAT) { - pr_debug("IRQ status %x: still set ignoring hardware errors\n", - hwstatus); + printk(KERN_INFO PFX "unable to clear error (so ignoring them)\n"); hw->intr_mask &= ~IS_HW_ERR; } } @@ -2855,12 +2854,11 @@ static void skge_extirq(unsigned long data) int port; spin_lock(&hw->phy_lock); - for (port = 0; port < 2; port++) { + for (port = 0; port < hw->ports; port++) { struct net_device *dev = hw->dev[port]; + struct skge_port *skge = netdev_priv(dev); - if (dev && netif_running(dev)) { - struct skge_port *skge = netdev_priv(dev); - + if (netif_running(dev)) { if (hw->chip_id != CHIP_ID_GENESIS) yukon_phy_intr(skge); else @@ -2869,38 +2867,39 @@ static void skge_extirq(unsigned long data) } spin_unlock(&hw->phy_lock); - spin_lock_irq(&hw->hw_lock); hw->intr_mask |= IS_EXT_REG; skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); } static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs) { struct skge_hw *hw = dev_id; - u32 status = skge_read32(hw, B0_SP_ISRC); + u32 status; - if (status == 0 || status == ~0) /* hotplug or shared irq */ + /* Reading this register masks IRQ */ + status = skge_read32(hw, B0_SP_ISRC); + if (status == 0) return IRQ_NONE; - spin_lock(&hw->hw_lock); - if (status & IS_R1_F) { + if (status & IS_EXT_REG) { + hw->intr_mask &= ~IS_EXT_REG; + tasklet_schedule(&hw->ext_tasklet); + } + + if (status & (IS_R1_F|IS_XA1_F)) { skge_write8(hw, Q_ADDR(Q_R1, Q_CSR), CSR_IRQ_CL_F); - hw->intr_mask &= ~IS_R1_F; + hw->intr_mask &= ~(IS_R1_F|IS_XA1_F); netif_rx_schedule(hw->dev[0]); } - if (status & IS_R2_F) { + if (status & (IS_R2_F|IS_XA2_F)) { skge_write8(hw, Q_ADDR(Q_R2, Q_CSR), CSR_IRQ_CL_F); - hw->intr_mask &= ~IS_R2_F; + hw->intr_mask &= ~(IS_R2_F|IS_XA2_F); netif_rx_schedule(hw->dev[1]); } - if (status & IS_XA1_F) - skge_tx_intr(hw->dev[0]); - - if (status & IS_XA2_F) - skge_tx_intr(hw->dev[1]); + if (likely((status & hw->intr_mask) == 0)) + return IRQ_HANDLED; if (status & IS_PA_TO_RX1) { struct skge_port *skge = netdev_priv(hw->dev[0]); @@ -2929,13 +2928,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs) if (status & IS_HW_ERR) skge_error_irq(hw); - if (status & IS_EXT_REG) { - hw->intr_mask &= ~IS_EXT_REG; - tasklet_schedule(&hw->ext_tasklet); - } - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock(&hw->hw_lock); return IRQ_HANDLED; } @@ -3010,7 +3003,7 @@ static const char *skge_board_name(const struct skge_hw *hw) static int skge_reset(struct skge_hw *hw) { u32 reg; - u16 ctst; + u16 ctst, pci_status; u8 t8, mac_cfg, pmd_type, phy_type; int i; @@ -3021,8 +3014,13 @@ static int skge_reset(struct skge_hw *hw) skge_write8(hw, B0_CTST, CS_RST_CLR); /* clear PCI errors, if any */ - skge_pci_clear(hw); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); + skge_write8(hw, B2_TST_CTRL2, 0); + pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status); + pci_write_config_word(hw->pdev, PCI_STATUS, + pci_status | PCI_STATUS_ERROR_BITS); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); skge_write8(hw, B0_CTST, CS_MRST_CLR); /* restore CLK_RUN bits (for Yukon-Lite) */ @@ -3081,7 +3079,10 @@ static int 
skge_reset(struct skge_hw *hw) else hw->ram_size = t8 * 4096; - hw->intr_mask = IS_HW_ERR | IS_EXT_REG; + hw->intr_mask = IS_HW_ERR | IS_EXT_REG | IS_PORT_1; + if (hw->ports > 1) + hw->intr_mask |= IS_PORT_2; + if (hw->chip_id == CHIP_ID_GENESIS) genesis_init(hw); else { @@ -3251,13 +3252,15 @@ static int __devinit skge_probe(struct pci_dev *pdev, struct skge_hw *hw; int err, using_dac = 0; - if ((err = pci_enable_device(pdev))) { + err = pci_enable_device(pdev); + if (err) { printk(KERN_ERR PFX "%s cannot enable PCI device\n", pci_name(pdev)); goto err_out; } - if ((err = pci_request_regions(pdev, DRV_NAME))) { + err = pci_request_regions(pdev, DRV_NAME); + if (err) { printk(KERN_ERR PFX "%s cannot obtain PCI resources\n", pci_name(pdev)); goto err_out_disable_pdev; @@ -3265,22 +3268,18 @@ static int __devinit skge_probe(struct pci_dev *pdev, pci_set_master(pdev); - if (sizeof(dma_addr_t) > sizeof(u32) && - !(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) { + if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { using_dac = 1; err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); - if (err < 0) { - printk(KERN_ERR PFX "%s unable to obtain 64 bit DMA " - "for consistent allocations\n", pci_name(pdev)); - goto err_out_free_regions; - } - } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); - if (err) { - printk(KERN_ERR PFX "%s no usable DMA configuration\n", - pci_name(pdev)); - goto err_out_free_regions; - } + } else if (!(err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { + using_dac = 0; + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + } + + if (err) { + printk(KERN_ERR PFX "%s no usable DMA configuration\n", + pci_name(pdev)); + goto err_out_free_regions; } #ifdef __BIG_ENDIAN @@ -3304,7 +3303,6 @@ static int __devinit skge_probe(struct pci_dev *pdev, hw->pdev = pdev; spin_lock_init(&hw->phy_lock); - spin_lock_init(&hw->hw_lock); tasklet_init(&hw->ext_tasklet, skge_extirq, (unsigned long) hw); hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000); @@ -3314,7 +3312,8 @@ static int __devinit skge_probe(struct pci_dev *pdev, goto err_out_free_hw; } - if ((err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw))) { + err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw); + if (err) { printk(KERN_ERR PFX "%s: cannot assign irq %d\n", pci_name(pdev), pdev->irq); goto err_out_iounmap; @@ -3332,7 +3331,8 @@ static int __devinit skge_probe(struct pci_dev *pdev, if ((dev = skge_devinit(hw, 0, using_dac)) == NULL) goto err_out_led_off; - if ((err = register_netdev(dev))) { + err = register_netdev(dev); + if (err) { printk(KERN_ERR PFX "%s: cannot register net device\n", pci_name(pdev)); goto err_out_free_netdev; @@ -3387,7 +3387,6 @@ static void __devexit skge_remove(struct pci_dev *pdev) skge_write32(hw, B0_IMSK, 0); skge_write16(hw, B0_LED, LED_STAT_OFF); - skge_pci_clear(hw); skge_write8(hw, B0_CTST, CS_RST_SET); tasklet_kill(&hw->ext_tasklet); diff --git a/drivers/net/skge.h b/drivers/net/skge.h index 941f12a333b..2efdacc290e 100644 --- a/drivers/net/skge.h +++ b/drivers/net/skge.h @@ -2402,7 +2402,6 @@ struct skge_hw { struct tasklet_struct ext_tasklet; spinlock_t phy_lock; - spinlock_t hw_lock; }; enum { diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 73260364cba..f08fe6c884b 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -51,7 +51,7 @@ #include "sky2.h" #define DRV_NAME "sky2" -#define DRV_VERSION "0.15" +#define DRV_VERSION "1.1" #define PFX DRV_NAME " " /* @@ -61,10 +61,6 @@ * a receive requires one (or two if using 64 bit 
dma). */ -#define is_ec_a1(hw) \ - unlikely((hw)->chip_id == CHIP_ID_YUKON_EC && \ - (hw)->chip_rev == CHIP_REV_YU_EC_A1) - #define RX_LE_SIZE 512 #define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le)) #define RX_MAX_PENDING (RX_LE_SIZE/2 - 2) @@ -96,6 +92,10 @@ static int copybreak __read_mostly = 256; module_param(copybreak, int, 0); MODULE_PARM_DESC(copybreak, "Receive copy threshold"); +static int disable_msi = 0; +module_param(disable_msi, int, 0); +MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)"); + static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, @@ -504,9 +504,9 @@ static void sky2_phy_init(struct sky2_hw *hw, unsigned port) /* Force a renegotiation */ static void sky2_phy_reinit(struct sky2_port *sky2) { - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); sky2_phy_init(sky2->hw, sky2->port); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); } static void sky2_mac_init(struct sky2_hw *hw, unsigned port) @@ -571,9 +571,9 @@ static void sky2_mac_init(struct sky2_hw *hw, unsigned port) sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC)); - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); sky2_phy_init(hw, port); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); /* MIB clear */ reg = gma_read16(hw, port, GM_PHY_ADDR); @@ -725,37 +725,11 @@ static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2) return le; } -/* - * This is a workaround code taken from SysKonnect sk98lin driver - * to deal with chip bug on Yukon EC rev 0 in the wraparound case. - */ -static void sky2_put_idx(struct sky2_hw *hw, unsigned q, - u16 idx, u16 *last, u16 size) +/* Update chip's next pointer */ +static inline void sky2_put_idx(struct sky2_hw *hw, unsigned q, u16 idx) { wmb(); - if (is_ec_a1(hw) && idx < *last) { - u16 hwget = sky2_read16(hw, Y2_QADDR(q, PREF_UNIT_GET_IDX)); - - if (hwget == 0) { - /* Start prefetching again */ - sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 0xe0); - goto setnew; - } - - if (hwget == size - 1) { - /* set watermark to one list element */ - sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 8); - - /* set put index to first list element */ - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), 0); - } else /* have hardware go to end of list */ - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), - size - 1); - } else { -setnew: - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx); - } - *last = idx; + sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx); mmiowb(); } @@ -878,7 +852,7 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!netif_running(dev)) return -ENODEV; /* Phy still in reset */ - switch(cmd) { + switch (cmd) { case SIOCGMIIPHY: data->phy_id = PHY_ADDR_MARV; @@ -886,9 +860,9 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: { u16 val = 0; - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); data->val_out = val; break; @@ -898,10 +872,10 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f, data->val_in); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); break; } return err; @@ -1001,7 +975,6 @@ static int 
sky2_rx_start(struct sky2_port *sky2) /* Tell chip about available buffers */ sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put); - sky2->rx_last_put = sky2_read16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX)); return 0; nomem: sky2_rx_clean(sky2); @@ -1014,7 +987,7 @@ static int sky2_up(struct net_device *dev) struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; unsigned port = sky2->port; - u32 ramsize, rxspace; + u32 ramsize, rxspace, imask; int err = -ENOMEM; if (netif_msg_ifup(sky2)) @@ -1079,10 +1052,10 @@ static int sky2_up(struct net_device *dev) goto err_out; /* Enable interrupts from phy/mac for port */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); + imask = sky2_read32(hw, B0_IMSK); + imask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; + sky2_write32(hw, B0_IMSK, imask); + return 0; err_out: @@ -1299,8 +1272,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); } - sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod, - &sky2->tx_last_put, TX_RING_SIZE); + sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod); out_unlock: spin_unlock(&sky2->tx_lock); @@ -1332,7 +1304,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) struct tx_ring_info *re = sky2->tx_ring + put; struct sk_buff *skb = re->skb; - nxt = re->idx; + nxt = re->idx; BUG_ON(nxt >= TX_RING_SIZE); prefetch(sky2->tx_ring + nxt); @@ -1348,7 +1320,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) struct tx_ring_info *fre; fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE; pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr), - skb_shinfo(skb)->frags[i].size, + skb_shinfo(skb)->frags[i].size, PCI_DMA_TODEVICE); } @@ -1356,7 +1328,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) } sky2->tx_cons = put; - if (netif_queue_stopped(dev) && tx_avail(sky2) > MAX_SKB_TX_LE) + if (tx_avail(sky2) > MAX_SKB_TX_LE) netif_wake_queue(dev); } @@ -1375,6 +1347,7 @@ static int sky2_down(struct net_device *dev) struct sky2_hw *hw = sky2->hw; unsigned port = sky2->port; u16 ctrl; + u32 imask; /* Never really got started! */ if (!sky2->tx_le) @@ -1386,14 +1359,6 @@ static int sky2_down(struct net_device *dev) /* Stop more packets from being queued */ netif_stop_queue(dev); - /* Disable port IRQ */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask &= ~((sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - - flush_scheduled_work(); - sky2_phy_reset(hw, port); /* Stop transmitter */ @@ -1437,6 +1402,11 @@ static int sky2_down(struct net_device *dev) sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET); sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET); + /* Disable port IRQ */ + imask = sky2_read32(hw, B0_IMSK); + imask &= ~(sky2->port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; + sky2_write32(hw, B0_IMSK, imask); + /* turn off LED's */ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF); @@ -1631,20 +1601,19 @@ static int sky2_autoneg_done(struct sky2_port *sky2, u16 aux) return 0; } -/* - * Interrupt from PHY are handled outside of interrupt context - * because accessing phy registers requires spin wait which might - * cause excess interrupt latency. 
- */ -static void sky2_phy_task(void *arg) +/* Interrupt from PHY */ +static void sky2_phy_intr(struct sky2_hw *hw, unsigned port) { - struct sky2_port *sky2 = arg; - struct sky2_hw *hw = sky2->hw; + struct net_device *dev = hw->dev[port]; + struct sky2_port *sky2 = netdev_priv(dev); u16 istatus, phystat; - down(&sky2->phy_sema); - istatus = gm_phy_read(hw, sky2->port, PHY_MARV_INT_STAT); - phystat = gm_phy_read(hw, sky2->port, PHY_MARV_PHY_STAT); + spin_lock(&sky2->phy_lock); + istatus = gm_phy_read(hw, port, PHY_MARV_INT_STAT); + phystat = gm_phy_read(hw, port, PHY_MARV_PHY_STAT); + + if (!netif_running(dev)) + goto out; if (netif_msg_intr(sky2)) printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n", @@ -1670,12 +1639,7 @@ static void sky2_phy_task(void *arg) sky2_link_down(sky2); } out: - up(&sky2->phy_sema); - - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= (sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); + spin_unlock(&sky2->phy_lock); } @@ -1687,31 +1651,40 @@ static void sky2_tx_timeout(struct net_device *dev) struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; unsigned txq = txqaddr[sky2->port]; - u16 ridx; - - /* Maybe we just missed an status interrupt */ - spin_lock(&sky2->tx_lock); - ridx = sky2_read16(hw, - sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX); - sky2_tx_complete(sky2, ridx); - spin_unlock(&sky2->tx_lock); - - if (!netif_queue_stopped(dev)) { - if (net_ratelimit()) - pr_info(PFX "transmit interrupt missed? recovered\n"); - return; - } + u16 report, done; if (netif_msg_timer(sky2)) printk(KERN_ERR PFX "%s: tx timeout\n", dev->name); - sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP); - sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET); + report = sky2_read16(hw, sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX); + done = sky2_read16(hw, Q_ADDR(txq, Q_DONE)); - sky2_tx_clean(sky2); + printk(KERN_DEBUG PFX "%s: transmit ring %u .. %u report=%u done=%u\n", + dev->name, + sky2->tx_cons, sky2->tx_prod, report, done); - sky2_qset(hw, txq); - sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1); + if (report != done) { + printk(KERN_INFO PFX "status burst pending (irq moderation?)\n"); + + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); + } else if (report != sky2->tx_cons) { + printk(KERN_INFO PFX "status report lost?\n"); + + spin_lock_bh(&sky2->tx_lock); + sky2_tx_complete(sky2, report); + spin_unlock_bh(&sky2->tx_lock); + } else { + printk(KERN_INFO PFX "hardware hung? 
flushing\n"); + + sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP); + sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET); + + sky2_tx_clean(sky2); + + sky2_qset(hw, txq); + sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1); + } } @@ -1730,6 +1703,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) struct sky2_hw *hw = sky2->hw; int err; u16 ctl, mode; + u32 imask; if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) return -EINVAL; @@ -1742,12 +1716,15 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) return 0; } + imask = sky2_read32(hw, B0_IMSK); sky2_write32(hw, B0_IMSK, 0); dev->trans_start = jiffies; /* prevent tx timeout */ netif_stop_queue(dev); netif_poll_disable(hw->dev[0]); + synchronize_irq(hw->pdev->irq); + ctl = gma_read16(hw, sky2->port, GM_GP_CTRL); gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA); sky2_rx_stop(sky2); @@ -1766,7 +1743,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD); err = sky2_rx_start(sky2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); + sky2_write32(hw, B0_IMSK, imask); if (err) dev_close(dev); @@ -1843,8 +1820,7 @@ resubmit: sky2_rx_add(sky2, re->mapaddr); /* Tell receiver about new buffers. */ - sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put, - &sky2->rx_last_put, RX_LE_SIZE); + sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put); return skb; @@ -1871,76 +1847,51 @@ error: goto resubmit; } -/* - * Check for transmit complete - */ -#define TX_NO_STATUS 0xffff - -static void sky2_tx_check(struct sky2_hw *hw, int port, u16 last) +/* Transmit complete */ +static inline void sky2_tx_done(struct net_device *dev, u16 last) { - if (last != TX_NO_STATUS) { - struct net_device *dev = hw->dev[port]; - if (dev && netif_running(dev)) { - struct sky2_port *sky2 = netdev_priv(dev); + struct sky2_port *sky2 = netdev_priv(dev); - spin_lock(&sky2->tx_lock); - sky2_tx_complete(sky2, last); - spin_unlock(&sky2->tx_lock); - } + if (netif_running(dev)) { + spin_lock(&sky2->tx_lock); + sky2_tx_complete(sky2, last); + spin_unlock(&sky2->tx_lock); } } -/* - * Both ports share the same status interrupt, therefore there is only - * one poll routine. - */ -static int sky2_poll(struct net_device *dev0, int *budget) +/* Process status response ring */ +static int sky2_status_intr(struct sky2_hw *hw, int to_do) { - struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; - unsigned int to_do = min(dev0->quota, *budget); - unsigned int work_done = 0; - u16 hwidx; - u16 tx_done[2] = { TX_NO_STATUS, TX_NO_STATUS }; - - sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ); - - /* - * Kick the STAT_LEV_TIMER_CTRL timer. - * This fixes my hangs on Yukon-EC (0xb6) rev 1. - * The if clause is there to start the timer only if it has been - * configured correctly and not been disabled via ethtool. 
- */ - if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_START) { - sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP); - sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START); - } + int work_done = 0; - hwidx = sky2_read16(hw, STAT_PUT_IDX); - BUG_ON(hwidx >= STATUS_RING_SIZE); rmb(); - while (hwidx != hw->st_idx) { + for(;;) { struct sky2_status_le *le = hw->st_le + hw->st_idx; struct net_device *dev; struct sky2_port *sky2; struct sk_buff *skb; u32 status; u16 length; + u8 link, opcode; + + opcode = le->opcode; + if (!opcode) + break; + opcode &= ~HW_OWNER; - le = hw->st_le + hw->st_idx; hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE; - prefetch(hw->st_le + hw->st_idx); + le->opcode = 0; - BUG_ON(le->link >= 2); - dev = hw->dev[le->link]; - if (dev == NULL || !netif_running(dev)) - continue; + link = le->link; + BUG_ON(link >= 2); + dev = hw->dev[link]; sky2 = netdev_priv(dev); - status = le32_to_cpu(le->status); - length = le16_to_cpu(le->length); + length = le->length; + status = le->status; - switch (le->opcode & ~HW_OWNER) { + switch (opcode) { case OP_RXSTAT: skb = sky2_receive(sky2, length, status); if (!skb) @@ -1980,42 +1931,23 @@ static int sky2_poll(struct net_device *dev0, int *budget) case OP_TXINDEXLE: /* TX index reports status for both ports */ - tx_done[0] = status & 0xffff; - tx_done[1] = ((status >> 24) & 0xff) - | (u16)(length & 0xf) << 8; + sky2_tx_done(hw->dev[0], status & 0xffff); + if (hw->dev[1]) + sky2_tx_done(hw->dev[1], + ((status >> 24) & 0xff) + | (u16)(length & 0xf) << 8); break; default: if (net_ratelimit()) printk(KERN_WARNING PFX - "unknown status opcode 0x%x\n", le->opcode); + "unknown status opcode 0x%x\n", opcode); break; } } exit_loop: - sky2_tx_check(hw, 0, tx_done[0]); - sky2_tx_check(hw, 1, tx_done[1]); - - if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); - } - - if (likely(work_done < to_do)) { - spin_lock_irq(&hw->hw_lock); - __netif_rx_complete(dev0); - - hw->intr_mask |= Y2_IS_STAT_BMU; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - - return 0; - } else { - *budget -= work_done; - dev0->quota -= work_done; - return 1; - } + return work_done; } static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status) @@ -2134,57 +2066,97 @@ static void sky2_mac_intr(struct sky2_hw *hw, unsigned port) } } -static void sky2_phy_intr(struct sky2_hw *hw, unsigned port) +/* This should never happen it is a fatal situation */ +static void sky2_descriptor_error(struct sky2_hw *hw, unsigned port, + const char *rxtx, u32 mask) { struct net_device *dev = hw->dev[port]; struct sky2_port *sky2 = netdev_priv(dev); + u32 imask; + + printk(KERN_ERR PFX "%s: %s descriptor error (hardware problem)\n", + dev ? dev->name : "<not registered>", rxtx); - hw->intr_mask &= ~(port == 0 ? 
Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); + imask = sky2_read32(hw, B0_IMSK); + imask &= ~mask; + sky2_write32(hw, B0_IMSK, imask); - schedule_work(&sky2->phy_task); + if (dev) { + spin_lock(&sky2->phy_lock); + sky2_link_down(sky2); + spin_unlock(&sky2->phy_lock); + } } -static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs) +static int sky2_poll(struct net_device *dev0, int *budget) { - struct sky2_hw *hw = dev_id; - struct net_device *dev0 = hw->dev[0]; - u32 status; + struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; + int work_limit = min(dev0->quota, *budget); + int work_done = 0; + u32 status = sky2_read32(hw, B0_Y2_SP_EISR); - status = sky2_read32(hw, B0_Y2_SP_ISRC2); - if (status == 0 || status == ~0) - return IRQ_NONE; + if (unlikely(status & ~Y2_IS_STAT_BMU)) { + if (status & Y2_IS_HW_ERR) + sky2_hw_intr(hw); - spin_lock(&hw->hw_lock); - if (status & Y2_IS_HW_ERR) - sky2_hw_intr(hw); + if (status & Y2_IS_IRQ_PHY1) + sky2_phy_intr(hw, 0); - /* Do NAPI for Rx and Tx status */ - if (status & Y2_IS_STAT_BMU) { - hw->intr_mask &= ~Y2_IS_STAT_BMU; - sky2_write32(hw, B0_IMSK, hw->intr_mask); + if (status & Y2_IS_IRQ_PHY2) + sky2_phy_intr(hw, 1); - if (likely(__netif_rx_schedule_prep(dev0))) { - prefetch(&hw->st_le[hw->st_idx]); - __netif_rx_schedule(dev0); - } + if (status & Y2_IS_IRQ_MAC1) + sky2_mac_intr(hw, 0); + + if (status & Y2_IS_IRQ_MAC2) + sky2_mac_intr(hw, 1); + + if (status & Y2_IS_CHK_RX1) + sky2_descriptor_error(hw, 0, "receive", Y2_IS_CHK_RX1); + + if (status & Y2_IS_CHK_RX2) + sky2_descriptor_error(hw, 1, "receive", Y2_IS_CHK_RX2); + + if (status & Y2_IS_CHK_TXA1) + sky2_descriptor_error(hw, 0, "transmit", Y2_IS_CHK_TXA1); + + if (status & Y2_IS_CHK_TXA2) + sky2_descriptor_error(hw, 1, "transmit", Y2_IS_CHK_TXA2); } - if (status & Y2_IS_IRQ_PHY1) - sky2_phy_intr(hw, 0); + if (status & Y2_IS_STAT_BMU) { + work_done = sky2_status_intr(hw, work_limit); + *budget -= work_done; + dev0->quota -= work_done; + + if (work_done >= work_limit) + return 1; - if (status & Y2_IS_IRQ_PHY2) - sky2_phy_intr(hw, 1); + sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ); + } - if (status & Y2_IS_IRQ_MAC1) - sky2_mac_intr(hw, 0); + netif_rx_complete(dev0); - if (status & Y2_IS_IRQ_MAC2) - sky2_mac_intr(hw, 1); + status = sky2_read32(hw, B0_Y2_SP_LISR); + return 0; +} - sky2_write32(hw, B0_Y2_SP_ICR, 2); +static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs) +{ + struct sky2_hw *hw = dev_id; + struct net_device *dev0 = hw->dev[0]; + u32 status; - spin_unlock(&hw->hw_lock); + /* Reading this mask interrupts as side effect */ + status = sky2_read32(hw, B0_Y2_SP_ISRC2); + if (status == 0 || status == ~0) + return IRQ_NONE; + + prefetch(&hw->st_le[hw->st_idx]); + if (likely(__netif_rx_schedule_prep(dev0))) + __netif_rx_schedule(dev0); + else + printk(KERN_DEBUG PFX "irq race detected\n"); return IRQ_HANDLED; } @@ -2238,6 +2210,23 @@ static int sky2_reset(struct sky2_hw *hw) return -EOPNOTSUPP; } + hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4; + + /* This rev is really old, and requires untested workarounds */ + if (hw->chip_id == CHIP_ID_YUKON_EC && hw->chip_rev == CHIP_REV_YU_EC_A1) { + printk(KERN_ERR PFX "%s: unsupported revision Yukon-%s (0x%x) rev %d\n", + pci_name(hw->pdev), yukon2_name[hw->chip_id - CHIP_ID_YUKON_XL], + hw->chip_id, hw->chip_rev); + return -EOPNOTSUPP; + } + + /* This chip is new and not tested yet */ + if (hw->chip_id == CHIP_ID_YUKON_EC_U) { + pr_info(PFX "%s: is a 
version of Yukon 2 chipset that has not been tested yet.\n", + pci_name(hw->pdev)); + pr_info("Please report success/failure to maintainer <shemminger@osdl.org>\n"); + } + /* disable ASF */ if (hw->chip_id <= CHIP_ID_YUKON_EC) { sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET); @@ -2258,7 +2247,7 @@ static int sky2_reset(struct sky2_hw *hw) sky2_write8(hw, B0_CTST, CS_MRST_CLR); /* clear any PEX errors */ - if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) + if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL); @@ -2271,7 +2260,6 @@ static int sky2_reset(struct sky2_hw *hw) if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC)) ++hw->ports; } - hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4; sky2_set_power_state(hw, PCI_D0); @@ -2337,30 +2325,18 @@ static int sky2_reset(struct sky2_hw *hw) /* Set the list last index */ sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1); - /* These status setup values are copied from SysKonnect's driver */ - if (is_ec_a1(hw)) { - /* WA for dev. #4.3 */ - sky2_write16(hw, STAT_TX_IDX_TH, 0xfff); /* Tx Threshold */ - - /* set Status-FIFO watermark */ - sky2_write8(hw, STAT_FIFO_WM, 0x21); /* WA for dev. #4.18 */ + sky2_write16(hw, STAT_TX_IDX_TH, 10); + sky2_write8(hw, STAT_FIFO_WM, 16); - /* set Status-FIFO ISR watermark */ - sky2_write8(hw, STAT_FIFO_ISR_WM, 0x07); /* WA for dev. #4.18 */ - sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 10000)); - } else { - sky2_write16(hw, STAT_TX_IDX_TH, 10); - sky2_write8(hw, STAT_FIFO_WM, 16); - - /* set Status-FIFO ISR watermark */ - if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0) - sky2_write8(hw, STAT_FIFO_ISR_WM, 4); - else - sky2_write8(hw, STAT_FIFO_ISR_WM, 16); + /* set Status-FIFO ISR watermark */ + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0) + sky2_write8(hw, STAT_FIFO_ISR_WM, 4); + else + sky2_write8(hw, STAT_FIFO_ISR_WM, 16); - sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000)); - sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 7)); - } + sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000)); + sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 20)); + sky2_write32(hw, STAT_LEV_TIMER_INI, sky2_us2clk(hw, 100)); /* enable status unit */ sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON); @@ -2743,7 +2719,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data) ms = data * 1000; /* save initial values */ - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); if (hw->chip_id == CHIP_ID_YUKON_XL) { u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR); gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3); @@ -2759,9 +2735,9 @@ static int sky2_phys_id(struct net_device *dev, u32 data) sky2_led(hw, port, onoff); onoff = !onoff; - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); interrupted = msleep_interruptible(250); - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); ms -= 250; } @@ -2776,7 +2752,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data) gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl); gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover); } - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); return 0; } @@ -2806,38 +2782,6 @@ static int sky2_set_pauseparam(struct net_device *dev, return err; } -#ifdef CONFIG_PM -static void sky2_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -{ - struct sky2_port *sky2 = netdev_priv(dev); - - wol->supported = WAKE_MAGIC; - wol->wolopts = sky2->wol ? 
WAKE_MAGIC : 0; -} - -static int sky2_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -{ - struct sky2_port *sky2 = netdev_priv(dev); - struct sky2_hw *hw = sky2->hw; - - if (wol->wolopts != WAKE_MAGIC && wol->wolopts != 0) - return -EOPNOTSUPP; - - sky2->wol = wol->wolopts == WAKE_MAGIC; - - if (sky2->wol) { - memcpy_toio(hw->regs + WOL_MAC_ADDR, dev->dev_addr, ETH_ALEN); - - sky2_write16(hw, WOL_CTRL_STAT, - WOL_CTL_ENA_PME_ON_MAGIC_PKT | - WOL_CTL_ENA_MAGIC_PKT_UNIT); - } else - sky2_write16(hw, WOL_CTRL_STAT, WOL_CTL_DEFAULT); - - return 0; -} -#endif - static int sky2_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ecmd) { @@ -2878,19 +2822,11 @@ static int sky2_set_coalesce(struct net_device *dev, { struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; - const u32 tmin = sky2_clk2us(hw, 1); - const u32 tmax = 5000; - - if (ecmd->tx_coalesce_usecs != 0 && - (ecmd->tx_coalesce_usecs < tmin || ecmd->tx_coalesce_usecs > tmax)) - return -EINVAL; - - if (ecmd->rx_coalesce_usecs != 0 && - (ecmd->rx_coalesce_usecs < tmin || ecmd->rx_coalesce_usecs > tmax)) - return -EINVAL; + const u32 tmax = sky2_clk2us(hw, 0x0ffffff); - if (ecmd->rx_coalesce_usecs_irq != 0 && - (ecmd->rx_coalesce_usecs_irq < tmin || ecmd->rx_coalesce_usecs_irq > tmax)) + if (ecmd->tx_coalesce_usecs > tmax || + ecmd->rx_coalesce_usecs > tmax || + ecmd->rx_coalesce_usecs_irq > tmax) return -EINVAL; if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1) @@ -3025,10 +2961,6 @@ static struct ethtool_ops sky2_ethtool_ops = { .set_ringparam = sky2_set_ringparam, .get_pauseparam = sky2_get_pauseparam, .set_pauseparam = sky2_set_pauseparam, -#ifdef CONFIG_PM - .get_wol = sky2_get_wol, - .set_wol = sky2_set_wol, -#endif .phys_id = sky2_phys_id, .get_stats_count = sky2_get_stats_count, .get_ethtool_stats = sky2_get_ethtool_stats, @@ -3082,16 +3014,15 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw, sky2->speed = -1; sky2->advertising = sky2_supported_modes(hw); - /* Receive checksum disabled for Yukon XL + /* Receive checksum disabled for Yukon XL * because of observed problems with incorrect * values when multiple packets are received in one interrupt */ sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL); - INIT_WORK(&sky2->phy_task, sky2_phy_task, sky2); - init_MUTEX(&sky2->phy_sema); + spin_lock_init(&sky2->phy_lock); sky2->tx_pending = TX_DEF_PENDING; - sky2->rx_pending = is_ec_a1(hw) ? 
8 : RX_DEF_PENDING; + sky2->rx_pending = RX_DEF_PENDING; sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN); hw->dev[port] = dev; @@ -3133,6 +3064,66 @@ static void __devinit sky2_show_addr(struct net_device *dev) dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]); } +/* Handle software interrupt used during MSI test */ +static irqreturn_t __devinit sky2_test_intr(int irq, void *dev_id, + struct pt_regs *regs) +{ + struct sky2_hw *hw = dev_id; + u32 status = sky2_read32(hw, B0_Y2_SP_ISRC2); + + if (status == 0) + return IRQ_NONE; + + if (status & Y2_IS_IRQ_SW) { + hw->msi_detected = 1; + wake_up(&hw->msi_wait); + sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ); + } + sky2_write32(hw, B0_Y2_SP_ICR, 2); + + return IRQ_HANDLED; +} + +/* Test interrupt path by forcing a a software IRQ */ +static int __devinit sky2_test_msi(struct sky2_hw *hw) +{ + struct pci_dev *pdev = hw->pdev; + int err; + + sky2_write32(hw, B0_IMSK, Y2_IS_IRQ_SW); + + err = request_irq(pdev->irq, sky2_test_intr, SA_SHIRQ, DRV_NAME, hw); + if (err) { + printk(KERN_ERR PFX "%s: cannot assign irq %d\n", + pci_name(pdev), pdev->irq); + return err; + } + + init_waitqueue_head (&hw->msi_wait); + + sky2_write8(hw, B0_CTST, CS_ST_SW_IRQ); + wmb(); + + wait_event_timeout(hw->msi_wait, hw->msi_detected, HZ/10); + + if (!hw->msi_detected) { + /* MSI test failed, go back to INTx mode */ + printk(KERN_WARNING PFX "%s: No interrupt was generated using MSI, " + "switching to INTx mode. Please report this failure to " + "the PCI maintainer and include system chipset information.\n", + pci_name(pdev)); + + err = -EOPNOTSUPP; + sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ); + } + + sky2_write32(hw, B0_IMSK, 0); + + free_irq(pdev->irq, hw); + + return err; +} + static int __devinit sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -3201,7 +3192,6 @@ static int __devinit sky2_probe(struct pci_dev *pdev, goto err_out_free_hw; } hw->pm_cap = pm_cap; - spin_lock_init(&hw->hw_lock); #ifdef __BIG_ENDIAN /* byte swap descriptors in hardware */ @@ -3254,21 +3244,29 @@ static int __devinit sky2_probe(struct pci_dev *pdev, } } - err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw); + if (!disable_msi && pci_enable_msi(pdev) == 0) { + err = sky2_test_msi(hw); + if (err == -EOPNOTSUPP) + pci_disable_msi(pdev); + else if (err) + goto err_out_unregister; + } + + err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw); if (err) { printk(KERN_ERR PFX "%s: cannot assign irq %d\n", pci_name(pdev), pdev->irq); goto err_out_unregister; } - hw->intr_mask = Y2_IS_BASE; - sky2_write32(hw, B0_IMSK, hw->intr_mask); + sky2_write32(hw, B0_IMSK, Y2_IS_BASE); pci_set_drvdata(pdev, hw); return 0; err_out_unregister: + pci_disable_msi(pdev); if (dev1) { unregister_netdev(dev1); free_netdev(dev1); @@ -3311,6 +3309,7 @@ static void __devexit sky2_remove(struct pci_dev *pdev) sky2_read8(hw, B0_CTST); free_irq(pdev->irq, hw); + pci_disable_msi(pdev); pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h index dce955c76f3..d63cd5a1b71 100644 --- a/drivers/net/sky2.h +++ b/drivers/net/sky2.h @@ -278,13 +278,11 @@ enum { Y2_IS_CHK_TXS1 = 1<<1, /* Descriptor error TXS 1 */ Y2_IS_CHK_TXA1 = 1<<0, /* Descriptor error TXA 1 */ - Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU | - Y2_IS_POLL_CHK | Y2_IS_TWSI_RDY | - Y2_IS_IRQ_SW | Y2_IS_TIMINT, - Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1 | - Y2_IS_CHK_RX1 | Y2_IS_CHK_TXA1 | Y2_IS_CHK_TXS1, - 
Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2 | - Y2_IS_CHK_RX2 | Y2_IS_CHK_TXA2 | Y2_IS_CHK_TXS2, + Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU, + Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1 + | Y2_IS_CHK_TXA1 | Y2_IS_CHK_RX1, + Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2 + | Y2_IS_CHK_TXA2 | Y2_IS_CHK_RX2, }; /* B2_IRQM_HWE_MSK 32 bit IRQ Moderation HW Error Mask */ @@ -1832,6 +1830,7 @@ struct sky2_port { struct net_device *netdev; unsigned port; u32 msg_enable; + spinlock_t phy_lock; spinlock_t tx_lock ____cacheline_aligned_in_smp; struct tx_ring_info *tx_ring; @@ -1840,7 +1839,6 @@ struct sky2_port { u16 tx_prod; /* next le to use */ u32 tx_addr64; u16 tx_pending; - u16 tx_last_put; u16 tx_last_mss; struct ring_info *rx_ring ____cacheline_aligned_in_smp; @@ -1849,7 +1847,6 @@ struct sky2_port { u16 rx_next; /* next re to check */ u16 rx_put; /* next le index to use */ u16 rx_pending; - u16 rx_last_put; u16 rx_bufsize; #ifdef SKY2_VLAN_TAG_USED u16 rx_tag; @@ -1865,20 +1862,15 @@ struct sky2_port { u8 rx_pause; u8 tx_pause; u8 rx_csum; - u8 wol; struct net_device_stats net_stats; - struct work_struct phy_task; - struct semaphore phy_sema; }; struct sky2_hw { void __iomem *regs; struct pci_dev *pdev; struct net_device *dev[2]; - spinlock_t hw_lock; - u32 intr_mask; int pm_cap; u8 chip_id; @@ -1889,6 +1881,8 @@ struct sky2_hw { struct sky2_status_le *st_le; u32 st_idx; dma_addr_t st_dma; + int msi_detected; + wait_queue_head_t msi_wait; }; /* Register accessor for memory mapped device */ diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c index 75e9b3b910c..0e9833adf9f 100644 --- a/drivers/net/smc91x.c +++ b/drivers/net/smc91x.c @@ -215,15 +215,12 @@ struct smc_local { spinlock_t lock; -#ifdef SMC_CAN_USE_DATACS - u32 __iomem *datacs; -#endif - #ifdef SMC_USE_PXA_DMA /* DMA needs the physical address of the chip */ u_long physaddr; #endif void __iomem *base; + void __iomem *datacs; }; #if SMC_DEBUG > 0 @@ -2104,9 +2101,8 @@ static int smc_enable_device(struct platform_device *pdev) * Set the appropriate byte/word mode. 
*/ ecsr = readb(addr + (ECSR << SMC_IO_SHIFT)) & ~ECSR_IOIS8; -#ifndef SMC_CAN_USE_16BIT - ecsr |= ECSR_IOIS8; -#endif + if (!SMC_CAN_USE_16BIT) + ecsr |= ECSR_IOIS8; writeb(ecsr, addr + (ECSR << SMC_IO_SHIFT)); local_irq_restore(flags); @@ -2143,40 +2139,39 @@ static void smc_release_attrib(struct platform_device *pdev) release_mem_region(res->start, ATTRIB_SIZE); } -#ifdef SMC_CAN_USE_DATACS -static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) +static inline void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) { - struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); - struct smc_local *lp = netdev_priv(ndev); + if (SMC_CAN_USE_DATACS) { + struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); + struct smc_local *lp = netdev_priv(ndev); - if (!res) - return; + if (!res) + return; - if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) { - printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME); - return; - } + if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) { + printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME); + return; + } - lp->datacs = ioremap(res->start, SMC_DATA_EXTENT); + lp->datacs = ioremap(res->start, SMC_DATA_EXTENT); + } } static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) { - struct smc_local *lp = netdev_priv(ndev); - struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); + if (SMC_CAN_USE_DATACS) { + struct smc_local *lp = netdev_priv(ndev); + struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); - if (lp->datacs) - iounmap(lp->datacs); + if (lp->datacs) + iounmap(lp->datacs); - lp->datacs = NULL; + lp->datacs = NULL; - if (res) - release_mem_region(res->start, SMC_DATA_EXTENT); + if (res) + release_mem_region(res->start, SMC_DATA_EXTENT); + } } -#else -static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) {} -static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) {} -#endif /* * smc_init(void) diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h index e0efd1964e7..e1be1af5120 100644 --- a/drivers/net/smc91x.h +++ b/drivers/net/smc91x.h @@ -275,7 +275,10 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg) #define SMC_insw(a,r,p,l) readsw ((void*) ((a) + (r)), p, l) #define SMC_outw(v,a,r) ({ writew ((v), (a) + (r)); LPD7A40X_IOBARRIER; }) -static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) +#define SMC_outsw LPD7A40X_SMC_outsw + +static inline void LPD7A40X_SMC_outsw(unsigned long a, int r, + unsigned char* p, int l) { unsigned short* ps = (unsigned short*) p; while (l-- > 0) { @@ -342,10 +345,6 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) #endif -#ifndef SMC_IRQ_FLAGS -#define SMC_IRQ_FLAGS SA_TRIGGER_RISING -#endif - #ifdef SMC_USE_PXA_DMA /* * Let's use the DMA engine on the XScale PXA2xx for RX packets. This is @@ -441,10 +440,85 @@ smc_pxa_dma_irq(int dma, void *dummy, struct pt_regs *regs) #endif /* SMC_USE_PXA_DMA */ -/* Because of bank switching, the LAN91x uses only 16 I/O ports */ +/* + * Everything a particular hardware setup needs should have been defined + * at this point. Add stubs for the undefined cases, mainly to avoid + * compilation warnings since they'll be optimized away, or to prevent buggy + * use of them. 
+ */ + +#if ! SMC_CAN_USE_32BIT +#define SMC_inl(ioaddr, reg) ({ BUG(); 0; }) +#define SMC_outl(x, ioaddr, reg) BUG() +#define SMC_insl(a, r, p, l) BUG() +#define SMC_outsl(a, r, p, l) BUG() +#endif + +#if !defined(SMC_insl) || !defined(SMC_outsl) +#define SMC_insl(a, r, p, l) BUG() +#define SMC_outsl(a, r, p, l) BUG() +#endif + +#if ! SMC_CAN_USE_16BIT + +/* + * Any 16-bit access is performed with two 8-bit accesses if the hardware + * can't do it directly. Most registers are 16-bit so those are mandatory. + */ +#define SMC_outw(x, ioaddr, reg) \ + do { \ + unsigned int __val16 = (x); \ + SMC_outb( __val16, ioaddr, reg ); \ + SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\ + } while (0) +#define SMC_inw(ioaddr, reg) \ + ({ \ + unsigned int __val16; \ + __val16 = SMC_inb( ioaddr, reg ); \ + __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \ + __val16; \ + }) + +#define SMC_insw(a, r, p, l) BUG() +#define SMC_outsw(a, r, p, l) BUG() + +#endif + +#if !defined(SMC_insw) || !defined(SMC_outsw) +#define SMC_insw(a, r, p, l) BUG() +#define SMC_outsw(a, r, p, l) BUG() +#endif + +#if ! SMC_CAN_USE_8BIT +#define SMC_inb(ioaddr, reg) ({ BUG(); 0; }) +#define SMC_outb(x, ioaddr, reg) BUG() +#define SMC_insb(a, r, p, l) BUG() +#define SMC_outsb(a, r, p, l) BUG() +#endif + +#if !defined(SMC_insb) || !defined(SMC_outsb) +#define SMC_insb(a, r, p, l) BUG() +#define SMC_outsb(a, r, p, l) BUG() +#endif + +#ifndef SMC_CAN_USE_DATACS +#define SMC_CAN_USE_DATACS 0 +#endif + #ifndef SMC_IO_SHIFT #define SMC_IO_SHIFT 0 #endif + +#ifndef SMC_IRQ_FLAGS +#define SMC_IRQ_FLAGS SA_TRIGGER_RISING +#endif + +#ifndef SMC_INTERRUPT_PREAMBLE +#define SMC_INTERRUPT_PREAMBLE +#endif + + +/* Because of bank switching, the LAN91x uses only 16 I/O ports */ #define SMC_IO_EXTENT (16 << SMC_IO_SHIFT) #define SMC_DATA_EXTENT (4) @@ -817,6 +891,11 @@ static const char * chip_ids[ 16 ] = { * Note: the following macros do *not* select the bank -- this must * be done separately as needed in the main code. The SMC_REG() macro * only uses the bank argument for debugging purposes (when enabled). + * + * Note: despite inline functions being safer, everything leading to this + * should preferably be macros to let BUG() display the line number in + * the core source code since we're interested in the top call site + * not in any inline function location. */ #if SMC_DEBUG > 0 @@ -834,62 +913,142 @@ static const char * chip_ids[ 16 ] = { #define SMC_REG(reg, bank) (reg<<SMC_IO_SHIFT) #endif -#if SMC_CAN_USE_8BIT -#define SMC_GET_PN() SMC_inb( ioaddr, PN_REG ) -#define SMC_SET_PN(x) SMC_outb( x, ioaddr, PN_REG ) -#define SMC_GET_AR() SMC_inb( ioaddr, AR_REG ) -#define SMC_GET_TXFIFO() SMC_inb( ioaddr, TXFIFO_REG ) -#define SMC_GET_RXFIFO() SMC_inb( ioaddr, RXFIFO_REG ) -#define SMC_GET_INT() SMC_inb( ioaddr, INT_REG ) -#define SMC_ACK_INT(x) SMC_outb( x, ioaddr, INT_REG ) -#define SMC_GET_INT_MASK() SMC_inb( ioaddr, IM_REG ) -#define SMC_SET_INT_MASK(x) SMC_outb( x, ioaddr, IM_REG ) -#else -#define SMC_GET_PN() (SMC_inw( ioaddr, PN_REG ) & 0xFF) -#define SMC_SET_PN(x) SMC_outw( x, ioaddr, PN_REG ) -#define SMC_GET_AR() (SMC_inw( ioaddr, PN_REG ) >> 8) -#define SMC_GET_TXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) & 0xFF) -#define SMC_GET_RXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) >> 8) -#define SMC_GET_INT() (SMC_inw( ioaddr, INT_REG ) & 0xFF) +/* + * Hack Alert: Some setups just can't write 8 or 16 bits reliably when not + * aligned to a 32 bit boundary. I tell you that does exist! 
+ * Fortunately the affected register accesses can be easily worked around + * since we can write zeroes to the preceeding 16 bits without adverse + * effects and use a 32-bit access. + * + * Enforce it on any 32-bit capable setup for now. + */ +#define SMC_MUST_ALIGN_WRITE SMC_CAN_USE_32BIT + +#define SMC_GET_PN() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, PN_REG)) \ + : (SMC_inw(ioaddr, PN_REG) & 0xFF) ) + +#define SMC_SET_PN(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(0, 2)); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, PN_REG); \ + else \ + SMC_outw(x, ioaddr, PN_REG); \ + } while (0) + +#define SMC_GET_AR() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, AR_REG)) \ + : (SMC_inw(ioaddr, PN_REG) >> 8) ) + +#define SMC_GET_TXFIFO() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, TXFIFO_REG)) \ + : (SMC_inw(ioaddr, TXFIFO_REG) & 0xFF) ) + +#define SMC_GET_RXFIFO() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, RXFIFO_REG)) \ + : (SMC_inw(ioaddr, TXFIFO_REG) >> 8) ) + +#define SMC_GET_INT() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, INT_REG)) \ + : (SMC_inw(ioaddr, INT_REG) & 0xFF) ) + #define SMC_ACK_INT(x) \ do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \ - SMC_outw( __mask | (x), ioaddr, INT_REG ); \ - local_irq_restore(__flags); \ + if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, INT_REG); \ + else { \ + unsigned long __flags; \ + int __mask; \ + local_irq_save(__flags); \ + __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \ + SMC_outw( __mask | (x), ioaddr, INT_REG ); \ + local_irq_restore(__flags); \ + } \ + } while (0) + +#define SMC_GET_INT_MASK() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, IM_REG)) \ + : (SMC_inw( ioaddr, INT_REG ) >> 8) ) + +#define SMC_SET_INT_MASK(x) \ + do { \ + if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, IM_REG); \ + else \ + SMC_outw((x) << 8, ioaddr, INT_REG); \ + } while (0) + +#define SMC_CURRENT_BANK() SMC_inw(ioaddr, BANK_SELECT) + +#define SMC_SELECT_BANK(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, 12<<SMC_IO_SHIFT); \ + else \ + SMC_outw(x, ioaddr, BANK_SELECT); \ + } while (0) + +#define SMC_GET_BASE() SMC_inw(ioaddr, BASE_REG) + +#define SMC_SET_BASE(x) SMC_outw(x, ioaddr, BASE_REG) + +#define SMC_GET_CONFIG() SMC_inw(ioaddr, CONFIG_REG) + +#define SMC_SET_CONFIG(x) SMC_outw(x, ioaddr, CONFIG_REG) + +#define SMC_GET_COUNTER() SMC_inw(ioaddr, COUNTER_REG) + +#define SMC_GET_CTL() SMC_inw(ioaddr, CTL_REG) + +#define SMC_SET_CTL(x) SMC_outw(x, ioaddr, CTL_REG) + +#define SMC_GET_MII() SMC_inw(ioaddr, MII_REG) + +#define SMC_SET_MII(x) SMC_outw(x, ioaddr, MII_REG) + +#define SMC_GET_MIR() SMC_inw(ioaddr, MIR_REG) + +#define SMC_SET_MIR(x) SMC_outw(x, ioaddr, MIR_REG) + +#define SMC_GET_MMU_CMD() SMC_inw(ioaddr, MMU_CMD_REG) + +#define SMC_SET_MMU_CMD(x) SMC_outw(x, ioaddr, MMU_CMD_REG) + +#define SMC_GET_FIFO() SMC_inw(ioaddr, FIFO_REG) + +#define SMC_GET_PTR() SMC_inw(ioaddr, PTR_REG) + +#define SMC_SET_PTR(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(4, 2)); \ + else \ + SMC_outw(x, ioaddr, PTR_REG); \ } while (0) -#define SMC_GET_INT_MASK() (SMC_inw( ioaddr, INT_REG ) >> 8) -#define SMC_SET_INT_MASK(x) SMC_outw( (x) << 8, ioaddr, INT_REG ) -#endif -#define SMC_CURRENT_BANK() SMC_inw( ioaddr, BANK_SELECT ) -#define SMC_SELECT_BANK(x) SMC_outw( x, ioaddr, BANK_SELECT ) -#define SMC_GET_BASE() SMC_inw( ioaddr, BASE_REG ) -#define SMC_SET_BASE(x) SMC_outw( x, ioaddr, BASE_REG ) -#define 
SMC_GET_CONFIG() SMC_inw( ioaddr, CONFIG_REG ) -#define SMC_SET_CONFIG(x) SMC_outw( x, ioaddr, CONFIG_REG ) -#define SMC_GET_COUNTER() SMC_inw( ioaddr, COUNTER_REG ) -#define SMC_GET_CTL() SMC_inw( ioaddr, CTL_REG ) -#define SMC_SET_CTL(x) SMC_outw( x, ioaddr, CTL_REG ) -#define SMC_GET_MII() SMC_inw( ioaddr, MII_REG ) -#define SMC_SET_MII(x) SMC_outw( x, ioaddr, MII_REG ) -#define SMC_GET_MIR() SMC_inw( ioaddr, MIR_REG ) -#define SMC_SET_MIR(x) SMC_outw( x, ioaddr, MIR_REG ) -#define SMC_GET_MMU_CMD() SMC_inw( ioaddr, MMU_CMD_REG ) -#define SMC_SET_MMU_CMD(x) SMC_outw( x, ioaddr, MMU_CMD_REG ) -#define SMC_GET_FIFO() SMC_inw( ioaddr, FIFO_REG ) -#define SMC_GET_PTR() SMC_inw( ioaddr, PTR_REG ) -#define SMC_SET_PTR(x) SMC_outw( x, ioaddr, PTR_REG ) -#define SMC_GET_EPH_STATUS() SMC_inw( ioaddr, EPH_STATUS_REG ) -#define SMC_GET_RCR() SMC_inw( ioaddr, RCR_REG ) -#define SMC_SET_RCR(x) SMC_outw( x, ioaddr, RCR_REG ) -#define SMC_GET_REV() SMC_inw( ioaddr, REV_REG ) -#define SMC_GET_RPC() SMC_inw( ioaddr, RPC_REG ) -#define SMC_SET_RPC(x) SMC_outw( x, ioaddr, RPC_REG ) -#define SMC_GET_TCR() SMC_inw( ioaddr, TCR_REG ) -#define SMC_SET_TCR(x) SMC_outw( x, ioaddr, TCR_REG ) +#define SMC_GET_EPH_STATUS() SMC_inw(ioaddr, EPH_STATUS_REG) + +#define SMC_GET_RCR() SMC_inw(ioaddr, RCR_REG) + +#define SMC_SET_RCR(x) SMC_outw(x, ioaddr, RCR_REG) + +#define SMC_GET_REV() SMC_inw(ioaddr, REV_REG) + +#define SMC_GET_RPC() SMC_inw(ioaddr, RPC_REG) + +#define SMC_SET_RPC(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(8, 0)); \ + else \ + SMC_outw(x, ioaddr, RPC_REG); \ + } while (0) + +#define SMC_GET_TCR() SMC_inw(ioaddr, TCR_REG) + +#define SMC_SET_TCR(x) SMC_outw(x, ioaddr, TCR_REG) #ifndef SMC_GET_MAC_ADDR #define SMC_GET_MAC_ADDR(addr) \ @@ -920,151 +1079,84 @@ static const char * chip_ids[ 16 ] = { SMC_outw( mt[6] | (mt[7] << 8), ioaddr, MCAST_REG4 ); \ } while (0) -#if SMC_CAN_USE_32BIT -/* - * Some setups just can't write 8 or 16 bits reliably when not aligned - * to a 32 bit boundary. I tell you that exists! - * We re-do the ones here that can be easily worked around if they can have - * their low parts written to 0 without adverse effects. 
- */ -#undef SMC_SELECT_BANK -#define SMC_SELECT_BANK(x) SMC_outl( (x)<<16, ioaddr, 12<<SMC_IO_SHIFT ) -#undef SMC_SET_RPC -#define SMC_SET_RPC(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(8, 0) ) -#undef SMC_SET_PN -#define SMC_SET_PN(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(0, 2) ) -#undef SMC_SET_PTR -#define SMC_SET_PTR(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(4, 2) ) -#endif - -#if SMC_CAN_USE_32BIT -#define SMC_PUT_PKT_HDR(status, length) \ - SMC_outl( (status) | (length) << 16, ioaddr, DATA_REG ) -#define SMC_GET_PKT_HDR(status, length) \ - do { \ - unsigned int __val = SMC_inl( ioaddr, DATA_REG ); \ - (status) = __val & 0xffff; \ - (length) = __val >> 16; \ - } while (0) -#else #define SMC_PUT_PKT_HDR(status, length) \ do { \ - SMC_outw( status, ioaddr, DATA_REG ); \ - SMC_outw( length, ioaddr, DATA_REG ); \ - } while (0) -#define SMC_GET_PKT_HDR(status, length) \ - do { \ - (status) = SMC_inw( ioaddr, DATA_REG ); \ - (length) = SMC_inw( ioaddr, DATA_REG ); \ + if (SMC_CAN_USE_32BIT) \ + SMC_outl((status) | (length)<<16, ioaddr, DATA_REG); \ + else { \ + SMC_outw(status, ioaddr, DATA_REG); \ + SMC_outw(length, ioaddr, DATA_REG); \ + } \ } while (0) -#endif -#if SMC_CAN_USE_32BIT -#define _SMC_PUSH_DATA(p, l) \ +#define SMC_GET_PKT_HDR(status, length) \ do { \ - char *__ptr = (p); \ - int __len = (l); \ - if (__len >= 2 && (unsigned long)__ptr & 2) { \ - __len -= 2; \ - SMC_outw( *(u16 *)__ptr, ioaddr, DATA_REG ); \ - __ptr += 2; \ - } \ - SMC_outsl( ioaddr, DATA_REG, __ptr, __len >> 2); \ - if (__len & 2) { \ - __ptr += (__len & ~3); \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ + if (SMC_CAN_USE_32BIT) { \ + unsigned int __val = SMC_inl(ioaddr, DATA_REG); \ + (status) = __val & 0xffff; \ + (length) = __val >> 16; \ + } else { \ + (status) = SMC_inw(ioaddr, DATA_REG); \ + (length) = SMC_inw(ioaddr, DATA_REG); \ } \ } while (0) -#define _SMC_PULL_DATA(p, l) \ - do { \ - char *__ptr = (p); \ - int __len = (l); \ - if ((unsigned long)__ptr & 2) { \ - /* \ - * We want 32bit alignment here. \ - * Since some buses perform a full 32bit \ - * fetch even for 16bit data we can't use \ - * SMC_inw() here. Back both source (on chip \ - * and destination) pointers of 2 bytes. \ - */ \ - __ptr -= 2; \ - __len += 2; \ - SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \ - } \ - __len += 2; \ - SMC_insl( ioaddr, DATA_REG, __ptr, __len >> 2); \ - } while (0) -#elif SMC_CAN_USE_16BIT -#define _SMC_PUSH_DATA(p, l) SMC_outsw( ioaddr, DATA_REG, p, (l) >> 1 ) -#define _SMC_PULL_DATA(p, l) SMC_insw ( ioaddr, DATA_REG, p, (l) >> 1 ) -#elif SMC_CAN_USE_8BIT -#define _SMC_PUSH_DATA(p, l) SMC_outsb( ioaddr, DATA_REG, p, l ) -#define _SMC_PULL_DATA(p, l) SMC_insb ( ioaddr, DATA_REG, p, l ) -#endif -#if ! 
SMC_CAN_USE_16BIT -#define SMC_outw(x, ioaddr, reg) \ +#define SMC_PUSH_DATA(p, l) \ do { \ - unsigned int __val16 = (x); \ - SMC_outb( __val16, ioaddr, reg ); \ - SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\ + if (SMC_CAN_USE_32BIT) { \ + void *__ptr = (p); \ + int __len = (l); \ + void *__ioaddr = ioaddr; \ + if (__len >= 2 && (unsigned long)__ptr & 2) { \ + __len -= 2; \ + SMC_outw(*(u16 *)__ptr, ioaddr, DATA_REG); \ + __ptr += 2; \ + } \ + if (SMC_CAN_USE_DATACS && lp->datacs) \ + __ioaddr = lp->datacs; \ + SMC_outsl(__ioaddr, DATA_REG, __ptr, __len>>2); \ + if (__len & 2) { \ + __ptr += (__len & ~3); \ + SMC_outw(*((u16 *)__ptr), ioaddr, DATA_REG); \ + } \ + } else if (SMC_CAN_USE_16BIT) \ + SMC_outsw(ioaddr, DATA_REG, p, (l) >> 1); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_outsb(ioaddr, DATA_REG, p, l); \ } while (0) -#define SMC_inw(ioaddr, reg) \ - ({ \ - unsigned int __val16; \ - __val16 = SMC_inb( ioaddr, reg ); \ - __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \ - __val16; \ - }) -#endif - -#ifdef SMC_CAN_USE_DATACS -#define SMC_PUSH_DATA(p, l) \ - if ( lp->datacs ) { \ - unsigned char *__ptr = (p); \ - int __len = (l); \ - if (__len >= 2 && (unsigned long)__ptr & 2) { \ - __len -= 2; \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ - __ptr += 2; \ - } \ - outsl(lp->datacs, __ptr, __len >> 2); \ - if (__len & 2) { \ - __ptr += (__len & ~3); \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ - } \ - } else { \ - _SMC_PUSH_DATA(p, l); \ - } #define SMC_PULL_DATA(p, l) \ - if ( lp->datacs ) { \ - unsigned char *__ptr = (p); \ - int __len = (l); \ - if ((unsigned long)__ptr & 2) { \ - /* \ - * We want 32bit alignment here. \ - * Since some buses perform a full 32bit \ - * fetch even for 16bit data we can't use \ - * SMC_inw() here. Back both source (on chip \ - * and destination) pointers of 2 bytes. \ - */ \ - __ptr -= 2; \ + do { \ + if (SMC_CAN_USE_32BIT) { \ + void *__ptr = (p); \ + int __len = (l); \ + void *__ioaddr = ioaddr; \ + if ((unsigned long)__ptr & 2) { \ + /* \ + * We want 32bit alignment here. \ + * Since some buses perform a full \ + * 32bit fetch even for 16bit data \ + * we can't use SMC_inw() here. \ + * Back both source (on-chip) and \ + * destination pointers of 2 bytes. \ + * This is possible since the call to \ + * SMC_GET_PKT_HDR() already advanced \ + * the source pointer of 4 bytes, and \ + * the skb_reserve(skb, 2) advanced \ + * the destination pointer of 2 bytes. 
\ + */ \ + __ptr -= 2; \ + __len += 2; \ + SMC_SET_PTR(2|PTR_READ|PTR_RCV|PTR_AUTOINC); \ + } \ + if (SMC_CAN_USE_DATACS && lp->datacs) \ + __ioaddr = lp->datacs; \ __len += 2; \ - SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \ - } \ - __len += 2; \ - insl( lp->datacs, __ptr, __len >> 2); \ - } else { \ - _SMC_PULL_DATA(p, l); \ - } -#else -#define SMC_PUSH_DATA(p, l) _SMC_PUSH_DATA(p, l) -#define SMC_PULL_DATA(p, l) _SMC_PULL_DATA(p, l) -#endif - -#if !defined (SMC_INTERRUPT_PREAMBLE) -# define SMC_INTERRUPT_PREAMBLE -#endif + SMC_insl(__ioaddr, DATA_REG, __ptr, __len>>2); \ + } else if (SMC_CAN_USE_16BIT) \ + SMC_insw(ioaddr, DATA_REG, p, (l) >> 1); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_insb(ioaddr, DATA_REG, p, l); \ + } while (0) #endif /* _SMC91X_H_ */ diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index ff79e68b347..7b82ff090d4 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -3639,7 +3639,7 @@ iscsi_tcp_init(void) taskcache = kmem_cache_create("iscsi_taskcache", sizeof(struct iscsi_data_task), 0, - SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!taskcache) return -ENOMEM; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index a8b05ce5de5..7405d0df95d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1139,32 +1139,6 @@ sg_fasync(int fd, struct file *filp, int mode) return (retval < 0) ? retval : 0; } -/* When startFinish==1 increments page counts for pages other than the - first of scatter gather elements obtained from alloc_pages(). - When startFinish==0 decrements ... */ -static void -sg_rb_correct4mmap(Sg_scatter_hold * rsv_schp, int startFinish) -{ - struct scatterlist *sg = rsv_schp->buffer; - struct page *page; - int k, m; - - SCSI_LOG_TIMEOUT(3, printk("sg_rb_correct4mmap: startFinish=%d, scatg=%d\n", - startFinish, rsv_schp->k_use_sg)); - /* N.B. 
correction _not_ applied to base page of each allocation */ - for (k = 0; k < rsv_schp->k_use_sg; ++k, ++sg) { - for (m = PAGE_SIZE; m < sg->length; m += PAGE_SIZE) { - page = sg->page; - if (startFinish) - get_page(page); - else { - if (page_count(page) > 0) - __put_page(page); - } - } - } -} - static struct page * sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type) { @@ -1236,10 +1210,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) sa += len; } - if (0 == sfp->mmap_called) { - sg_rb_correct4mmap(rsv_schp, 1); /* do only once per fd lifetime */ - sfp->mmap_called = 1; - } + sfp->mmap_called = 1; vma->vm_flags |= VM_RESERVED; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; @@ -2388,8 +2359,6 @@ __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp) SCSI_LOG_TIMEOUT(6, printk("__sg_remove_sfp: bufflen=%d, k_use_sg=%d\n", (int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg)); - if (sfp->mmap_called) - sg_rb_correct4mmap(&sfp->reserve, 0); /* undo correction */ sg_remove_scat(&sfp->reserve); } sfp->parentdp = NULL; @@ -2471,9 +2440,9 @@ sg_page_malloc(int rqSz, int lowDma, int *retSzp) return resp; if (lowDma) - page_mask = GFP_ATOMIC | GFP_DMA | __GFP_NOWARN; + page_mask = GFP_ATOMIC | GFP_DMA | __GFP_COMP | __GFP_NOWARN; else - page_mask = GFP_ATOMIC | __GFP_NOWARN; + page_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; for (order = 0, a_size = PAGE_SIZE; a_size < rqSz; order++, a_size <<= 1) ; diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index 89e5413cc2a..c66ef96c71b 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -866,7 +866,7 @@ config SERIAL_M32R_PLDSIO config SERIAL_TXX9 bool "TMPTX39XX/49XX SIO support" - depends HAS_TXX9_SERIAL && BROKEN + depends HAS_TXX9_SERIAL select SERIAL_CORE default y diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c index ee98a867bc6..141173efd46 100644 --- a/drivers/serial/serial_txx9.c +++ b/drivers/serial/serial_txx9.c @@ -33,6 +33,10 @@ * 1.02 Cleanup. (import 8250.c changes) * 1.03 Fix low-latency mode. (import 8250.c changes) * 1.04 Remove usage of deprecated functions, cleanup. + * 1.05 More strict check in verify_port. Cleanup. + * 1.06 Do not insert a char caused previous overrun. + * Fix some spin_locks. + * Do not call uart_add_one_port for absent ports. 
*/ #include <linux/config.h> @@ -57,7 +61,7 @@ #include <asm/io.h> #include <asm/irq.h> -static char *serial_version = "1.04"; +static char *serial_version = "1.06"; static char *serial_name = "TX39/49 Serial driver"; #define PASS_LIMIT 256 @@ -94,6 +98,8 @@ static char *serial_name = "TX39/49 Serial driver"; #define UART_NR 4 #endif +#define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8) + struct uart_txx9_port { struct uart_port port; @@ -210,7 +216,7 @@ static inline unsigned int sio_in(struct uart_txx9_port *up, int offset) { switch (up->port.iotype) { default: - return *(volatile u32 *)(up->port.membase + offset); + return __raw_readl(up->port.membase + offset); case UPIO_PORT: return inl(up->port.iobase + offset); } @@ -221,7 +227,7 @@ sio_out(struct uart_txx9_port *up, int offset, int value) { switch (up->port.iotype) { default: - *(volatile u32 *)(up->port.membase + offset) = value; + __raw_writel(value, up->port.membase + offset); break; case UPIO_PORT: outl(value, up->port.iobase + offset); @@ -259,34 +265,19 @@ sio_quot_set(struct uart_txx9_port *up, int quot) static void serial_txx9_stop_tx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_TIE); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_start_tx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); sio_set(up, TXX9_SIDICR, TXX9_SIDICR_TIE); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_stop_rx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); up->port.read_status_mask &= ~TXX9_SIDISR_RDIS; -#if 0 - sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_RIE); -#endif - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_enable_ms(struct uart_port *port) @@ -302,12 +293,16 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r unsigned int disr = *status; int max_count = 256; char flag; + unsigned int next_ignore_status_mask; do { ch = sio_in(up, TXX9_SIRFIFO); flag = TTY_NORMAL; up->port.icount.rx++; + /* mask out RFDN_MASK bit added by previous overrun */ + next_ignore_status_mask = + up->port.ignore_status_mask & ~TXX9_SIDISR_RFDN_MASK; if (unlikely(disr & (TXX9_SIDISR_UBRK | TXX9_SIDISR_UPER | TXX9_SIDISR_UFER | TXX9_SIDISR_UOER))) { /* @@ -328,8 +323,17 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r up->port.icount.parity++; else if (disr & TXX9_SIDISR_UFER) up->port.icount.frame++; - if (disr & TXX9_SIDISR_UOER) + if (disr & TXX9_SIDISR_UOER) { up->port.icount.overrun++; + /* + * The receiver read buffer still hold + * a char which caused overrun. + * Ignore next char by adding RFDN_MASK + * to ignore_status_mask temporarily. + */ + next_ignore_status_mask |= + TXX9_SIDISR_RFDN_MASK; + } /* * Mask off conditions which should be ingored. 
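
The receive_chars() hunk above works around a TX39/49 quirk: after an overrun the receive FIFO still holds the byte that triggered it, so the driver computes the ignore mask for the *next* character while handling the current one and arms TXX9_SIDISR_RFDN_MASK for exactly one iteration. Below is a minimal user-space sketch of that one-shot ignore-mask pattern; the DATA_READY/OVERRUN bits and the rx_one() helper are invented for illustration and only stand in for the real interrupt path.

    /* Hedged sketch, not the driver code: the overrun bit arms the ignore
     * mask for exactly one following character, then the mask is cleared
     * again.  All names here are invented for the illustration. */
    #include <stdio.h>

    #define DATA_READY 0x01
    #define OVERRUN    0x02

    static unsigned int ignore_mask;

    static void rx_one(unsigned int status, int ch)
    {
            /* start from the mask without the one-shot bit added last time */
            unsigned int next_ignore = ignore_mask & ~DATA_READY;

            if (status & OVERRUN)
                    /* FIFO still holds the byte that caused the overrun:
                     * drop the next DATA_READY event exactly once */
                    next_ignore |= DATA_READY;

            if (!(status & ignore_mask))
                    printf("delivered %c\n", ch);

            ignore_mask = next_ignore;
    }

    int main(void)
    {
            rx_one(DATA_READY | OVERRUN, 'a');   /* delivered, arms the skip */
            rx_one(DATA_READY, 'b');             /* stale byte, dropped once */
            rx_one(DATA_READY, 'c');             /* delivered again */
            return 0;
    }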
@@ -349,6 +353,7 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r uart_insert_char(&up->port, disr, TXX9_SIDISR_UOER, ch, flag); ignore_char: + up->port.ignore_status_mask = next_ignore_status_mask; disr = sio_in(up, TXX9_SIDISR); } while (!(disr & TXX9_SIDISR_UVALID) && (max_count-- > 0)); spin_unlock(&up->port.lock); @@ -450,14 +455,11 @@ static unsigned int serial_txx9_get_mctrl(struct uart_port *port) static void serial_txx9_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - spin_lock_irqsave(&up->port.lock, flags); if (mctrl & TIOCM_RTS) sio_mask(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); else sio_set(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_break_ctl(struct uart_port *port, int break_state) @@ -784,8 +786,14 @@ static void serial_txx9_config_port(struct uart_port *port, int uflags) static int serial_txx9_verify_port(struct uart_port *port, struct serial_struct *ser) { - if (ser->irq < 0 || - ser->baud_base < 9600 || ser->type != PORT_TXX9) + unsigned long new_port = ser->port; + if (HIGH_BITS_OFFSET) + new_port += (unsigned long)ser->port_high << HIGH_BITS_OFFSET; + if (ser->type != port->type || + ser->irq != port->irq || + ser->io_type != port->iotype || + new_port != port->iobase || + (unsigned long)ser->iomem_base != port->mapbase) return -EINVAL; return 0; } @@ -827,7 +835,8 @@ static void __init serial_txx9_register_ports(struct uart_driver *drv) up->port.line = i; up->port.ops = &serial_txx9_pops; - uart_add_one_port(drv, &up->port); + if (up->port.iobase || up->port.mapbase) + uart_add_one_port(drv, &up->port); } } @@ -927,11 +936,6 @@ static int serial_txx9_console_setup(struct console *co, char *options) return -ENODEV; /* - * Temporary fix. - */ - spin_lock_init(&port->lock); - - /* * Disable UART interrupts, set DTR and RTS high * and set speed. 
*/ @@ -1041,11 +1045,10 @@ static int __devinit serial_txx9_register_port(struct uart_port *port) mutex_lock(&serial_txx9_mutex); for (i = 0; i < UART_NR; i++) { uart = &serial_txx9_ports[i]; - if (uart->port.type == PORT_UNKNOWN) + if (!(uart->port.iobase || uart->port.mapbase)) break; } if (i < UART_NR) { - uart_remove_one_port(&serial_txx9_reg, &uart->port); uart->port.iobase = port->iobase; uart->port.membase = port->membase; uart->port.irq = port->irq; @@ -1080,9 +1083,8 @@ static void __devexit serial_txx9_unregister_port(int line) uart->port.type = PORT_UNKNOWN; uart->port.iobase = 0; uart->port.mapbase = 0; - uart->port.membase = 0; + uart->port.membase = NULL; uart->port.dev = NULL; - uart_add_one_port(&serial_txx9_reg, &uart->port); mutex_unlock(&serial_txx9_mutex); } @@ -1198,8 +1200,11 @@ static void __exit serial_txx9_exit(void) #ifdef ENABLE_SERIAL_TXX9_PCI pci_unregister_driver(&serial_txx9_pci_driver); #endif - for (i = 0; i < UART_NR; i++) - uart_remove_one_port(&serial_txx9_reg, &serial_txx9_ports[i].port); + for (i = 0; i < UART_NR; i++) { + struct uart_txx9_port *up = &serial_txx9_ports[i]; + if (up->port.iobase || up->port.mapbase) + uart_remove_one_port(&serial_txx9_reg, &up->port); + } uart_unregister_driver(&serial_txx9_reg); } diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c index d61494d185c..bd6294132c1 100644 --- a/drivers/serial/vr41xx_siu.c +++ b/drivers/serial/vr41xx_siu.c @@ -919,7 +919,7 @@ static struct uart_driver siu_uart_driver = { .cons = SERIAL_VR41XX_CONSOLE, }; -static int siu_probe(struct platform_device *dev) +static int __devinit siu_probe(struct platform_device *dev) { struct uart_port *port; int num, i, retval; @@ -953,7 +953,7 @@ static int siu_probe(struct platform_device *dev) return 0; } -static int siu_remove(struct platform_device *dev) +static int __devexit siu_remove(struct platform_device *dev) { struct uart_port *port; int i; @@ -1006,21 +1006,28 @@ static struct platform_device *siu_platform_device; static struct platform_driver siu_device_driver = { .probe = siu_probe, - .remove = siu_remove, + .remove = __devexit_p(siu_remove), .suspend = siu_suspend, .resume = siu_resume, .driver = { .name = "SIU", + .owner = THIS_MODULE, }, }; -static int __devinit vr41xx_siu_init(void) +static int __init vr41xx_siu_init(void) { int retval; - siu_platform_device = platform_device_register_simple("SIU", -1, NULL, 0); - if (IS_ERR(siu_platform_device)) - return PTR_ERR(siu_platform_device); + siu_platform_device = platform_device_alloc("SIU", -1); + if (!siu_platform_device) + return -ENOMEM; + + retval = platform_device_add(siu_platform_device); + if (retval < 0) { + platform_device_put(siu_platform_device); + return retval; + } retval = platform_driver_register(&siu_device_driver); if (retval < 0) @@ -1029,10 +1036,9 @@ static int __devinit vr41xx_siu_init(void) return retval; } -static void __devexit vr41xx_siu_exit(void) +static void __exit vr41xx_siu_exit(void) { platform_driver_unregister(&siu_device_driver); - platform_device_unregister(siu_platform_device); } diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c index ea75b3d0612..67140a5804f 100644 --- a/drivers/sn/ioc4.c +++ b/drivers/sn/ioc4.c @@ -31,7 +31,7 @@ #include <linux/ioc4.h> #include <linux/mmtimer.h> #include <linux/rtc.h> -#include <linux/rwsem.h> +#include <linux/mutex.h> #include <asm/sn/addrs.h> #include <asm/sn/clksupport.h> #include <asm/sn/shub_mmr.h> @@ -54,11 +54,10 @@ * Submodule management * ************************/ -static 
LIST_HEAD(ioc4_devices); -static DECLARE_RWSEM(ioc4_devices_rwsem); +static DEFINE_MUTEX(ioc4_mutex); +static LIST_HEAD(ioc4_devices); static LIST_HEAD(ioc4_submodules); -static DECLARE_RWSEM(ioc4_submodules_rwsem); /* Register an IOC4 submodule */ int @@ -66,15 +65,13 @@ ioc4_register_submodule(struct ioc4_submodule *is) { struct ioc4_driver_data *idd; - down_write(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_add(&is->is_list, &ioc4_submodules); - up_write(&ioc4_submodules_rwsem); /* Initialize submodule for each IOC4 */ if (!is->is_probe) - return 0; + goto out; - down_read(&ioc4_devices_rwsem); list_for_each_entry(idd, &ioc4_devices, idd_list) { if (is->is_probe(idd)) { printk(KERN_WARNING @@ -84,8 +81,8 @@ ioc4_register_submodule(struct ioc4_submodule *is) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_devices_rwsem); - + out: + mutex_unlock(&ioc4_mutex); return 0; } @@ -95,15 +92,13 @@ ioc4_unregister_submodule(struct ioc4_submodule *is) { struct ioc4_driver_data *idd; - down_write(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_del(&is->is_list); - up_write(&ioc4_submodules_rwsem); /* Remove submodule for each IOC4 */ if (!is->is_remove) - return; + goto out; - down_read(&ioc4_devices_rwsem); list_for_each_entry(idd, &ioc4_devices, idd_list) { if (is->is_remove(idd)) { printk(KERN_WARNING @@ -113,7 +108,8 @@ ioc4_unregister_submodule(struct ioc4_submodule *is) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_devices_rwsem); + out: + mutex_unlock(&ioc4_mutex); } /********************* @@ -312,12 +308,11 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) /* Track PCI-device specific data */ idd->idd_serial_data = NULL; pci_set_drvdata(idd->idd_pdev, idd); - down_write(&ioc4_devices_rwsem); + + mutex_lock(&ioc4_mutex); list_add(&idd->idd_list, &ioc4_devices); - up_write(&ioc4_devices_rwsem); /* Add this IOC4 to all submodules */ - down_read(&ioc4_submodules_rwsem); list_for_each_entry(is, &ioc4_submodules, is_list) { if (is->is_probe && is->is_probe(idd)) { printk(KERN_WARNING @@ -327,7 +322,7 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_submodules_rwsem); + mutex_unlock(&ioc4_mutex); return 0; @@ -351,7 +346,7 @@ ioc4_remove(struct pci_dev *pdev) idd = pci_get_drvdata(pdev); /* Remove this IOC4 from all submodules */ - down_read(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_for_each_entry(is, &ioc4_submodules, is_list) { if (is->is_remove && is->is_remove(idd)) { printk(KERN_WARNING @@ -361,7 +356,7 @@ ioc4_remove(struct pci_dev *pdev) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_submodules_rwsem); + mutex_unlock(&ioc4_mutex); /* Release resources */ iounmap(idd->idd_misc_regs); @@ -377,9 +372,9 @@ ioc4_remove(struct pci_dev *pdev) pci_disable_device(pdev); /* Remove and free driver data */ - down_write(&ioc4_devices_rwsem); + mutex_lock(&ioc4_mutex); list_del(&idd->idd_list); - up_write(&ioc4_devices_rwsem); + mutex_unlock(&ioc4_mutex); kfree(idd); } diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index b058273527b..76448d6ae89 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end) */ page = virt_to_page(virtual_start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(virtual_start); virtual_start += PAGE_SIZE; diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c index 
d8467c03b49..788297e9d59 100644 --- a/drivers/video/i810/i810_main.c +++ b/drivers/video/i810/i810_main.c @@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info *info, struct fb_cursor *cursor) int size = ((cursor->image.width + 7) >> 3) * cursor->image.height; int i; - u8 *data = kmalloc(64 * 8, GFP_KERNEL); + u8 *data = kmalloc(64 * 8, GFP_ATOMIC); if (data == NULL) return -ENOMEM; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3ad8455f857..651a9e14d9a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); + dentry->d_op = &v9fs_dentry_operations; dirfid = v9fs_fid_lookup(dentry->d_parent); if (!dirfid) { @@ -681,8 +682,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; fid->qid = fcall->params.rstat.stat.qid; - - dentry->d_op = &v9fs_dentry_operations; v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); d_add(dentry, inode); diff --git a/fs/buffer.c b/fs/buffer.c index a9b39940200..1d3683d496f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3051,68 +3051,6 @@ asmlinkage long sys_bdflush(int func, long data) } /* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -#ifdef CONFIG_MIGRATION -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); -#endif - -/* * Buffer-head allocation */ static kmem_cache_t *bh_cachep; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b3519528994..25fa8bba8cb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache. 
- * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -static unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - pgoff_t endpg = next + hugepages; - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long bytes; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; + if (vma->vm_flags & VM_MAYSHARE) + if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) + goto out; + ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) @@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) put_page(page); } -static void truncate_hugepages(struct address_space *mapping, loff_t lstart) +static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; int i; + hugetlb_truncate_reservation(HUGETLBFS_I(inode), + lstart >> HPAGE_SHIFT); + if (!mapping->nrpages) + return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); } @@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); destroy_inode(inode); } @@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); spin_unlock(&mapping->i_mmap_lock); - truncate_hugepages(mapping, offset); + truncate_hugepages(inode, offset); return 0; } @@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -771,21 +737,6 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -/* - * 
Return the next identifier for a shm file - */ -static unsigned long hugetlbfs_counter(void) -{ - static DEFINE_SPINLOCK(lock); - static unsigned long counter; - unsigned long ret; - - spin_lock(&lock); - ret = ++counter; - spin_unlock(&lock); - return ret; -} - static int can_do_hugetlb_shm(void) { return likely(capable(CAP_IPC_LOCK) || @@ -801,18 +752,16 @@ struct file *hugetlb_zero_setup(size_t size) struct dentry *dentry, *root; struct qstr quick_string; char buf[16]; + static atomic_t counter; if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!is_hugepage_mem_enough(size)) - return ERR_PTR(-ENOMEM); - if (!user_shm_lock(size, current->user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; - snprintf(buf, 16, "%lu", hugetlbfs_counter()); + snprintf(buf, 16, "%u", atomic_inc_return(&counter)); quick_string.name = buf; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; @@ -831,6 +780,11 @@ struct file *hugetlb_zero_setup(size_t size) if (!inode) goto out_file; + error = -ENOMEM; + if (hugetlb_extend_reservation(HUGETLBFS_I(inode), + size >> HPAGE_SHIFT) != 0) + goto out_inode; + d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; @@ -841,6 +795,8 @@ struct file *hugetlb_zero_setup(size_t size) file->f_mode = FMODE_WRITE | FMODE_READ; return file; +out_inode: + iput(inode); out_file: put_filp(file); out_dentry: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8dd3aafec49..09e1c57a86a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void) ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", sizeof(struct ocfs2_journal_lock), 0, - SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!ocfs2_lock_cache) return -ENOMEM; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 3f810acd0bf..b1ca234068f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) xpages = 1UL << order; npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - for (loop = 0; loop < npages; loop++) - set_page_count(pages + loop, 1); + split_page(pages, order); /* trim off any pages we don't actually require */ for (loop = npages; loop < xpages; loop++) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index bfb4f2917bb..8cdfa415165 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -29,6 +29,7 @@ #include <linux/blkdev.h> #include <linux/hash.h> #include <linux/kthread.h> +#include <linux/migrate.h> #include "xfs_linux.h" STATIC kmem_zone_t *xfs_buf_zone; diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h index 55059abf9c9..20f52395421 100644 --- a/include/asm-i386/acpi.h +++ b/include/asm-i386/acpi.h @@ -103,6 +103,12 @@ __acpi_release_global_lock (unsigned int *lock) :"=r"(n_hi), "=r"(n_lo) \ :"0"(n_hi), "1"(n_lo)) +#ifdef CONFIG_X86_IO_APIC +extern void check_acpi_pci(void); +#else +static inline void check_acpi_pci(void) { } +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -128,8 +134,6 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); extern int skip_ioapic_setup; extern int acpi_skip_timer_override; -extern void check_acpi_pci(void); - static inline void disable_ioapic_setup(void) { skip_ioapic_setup = 1; @@ -142,8 +146,6 @@ static inline int ioapic_setup_disabled(void) #else static inline void disable_ioapic_setup(void) { } -static inline void check_acpi_pci(void) { } - 
#endif static inline void acpi_noirq_set(void) { acpi_noirq = 1; } diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 088a945bf26..ee056c41a9f 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -219,13 +219,12 @@ extern unsigned long pg0[]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT) static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } -static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } /* * The following only works if pte_present() is not true. @@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h index a7122d85017..d069b6acddc 100644 --- a/include/asm-ia64/intel_intrin.h +++ b/include/asm-ia64/intel_intrin.h @@ -5,113 +5,10 @@ * * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com> * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com> + * Copyright (C) 2005,2006 Hongjiu Lu <hongjiu.lu@intel.com> * */ -#include <asm/types.h> - -void __lfetch(int lfhint, void *y); -void __lfetch_excl(int lfhint, void *y); -void __lfetch_fault(int lfhint, void *y); -void __lfetch_fault_excl(int lfhint, void *y); - -/* In the following, whichFloatReg should be an integer from 0-127 */ -void __ldfs(const int whichFloatReg, void *src); -void __ldfd(const int whichFloatReg, void *src); -void __ldfe(const int whichFloatReg, void *src); -void __ldf8(const int whichFloatReg, void *src); -void __ldf_fill(const int whichFloatReg, void *src); -void __stfs(void *dst, const int whichFloatReg); -void __stfd(void *dst, const int whichFloatReg); -void __stfe(void *dst, const int whichFloatReg); -void __stf8(void *dst, const int whichFloatReg); -void __stf_spill(void *dst, const int whichFloatReg); - -void __st1_rel(void *dst, const __s8 value); -void __st2_rel(void *dst, const __s16 value); -void __st4_rel(void *dst, const __s32 value); -void __st8_rel(void *dst, const __s64 value); -__u8 __ld1_acq(void *src); -__u16 __ld2_acq(void *src); -__u32 __ld4_acq(void *src); -__u64 __ld8_acq(void *src); - -__u64 __fetchadd4_acq(__u32 *addend, const int increment); -__u64 __fetchadd4_rel(__u32 *addend, const int increment); -__u64 __fetchadd8_acq(__u64 *addend, const int increment); -__u64 __fetchadd8_rel(__u64 *addend, const int increment); - -__u64 __getf_exp(double d); - -/* OS Related Itanium(R) Intrinsics */ - -/* The names to use for whichReg and whichIndReg below come from - the include file 
asm/ia64regs.h */ - -__u64 __getIndReg(const int whichIndReg, __s64 index); -__u64 __getReg(const int whichReg); - -void __setIndReg(const int whichIndReg, __s64 index, __u64 value); -void __setReg(const int whichReg, __u64 value); - -void __mf(void); -void __mfa(void); -void __synci(void); -void __itcd(__s64 pa); -void __itci(__s64 pa); -void __itrd(__s64 whichTransReg, __s64 pa); -void __itri(__s64 whichTransReg, __s64 pa); -void __ptce(__s64 va); -void __ptcl(__s64 va, __s64 pagesz); -void __ptcg(__s64 va, __s64 pagesz); -void __ptcga(__s64 va, __s64 pagesz); -void __ptri(__s64 va, __s64 pagesz); -void __ptrd(__s64 va, __s64 pagesz); -void __invala (void); -void __invala_gr(const int whichGeneralReg /* 0-127 */ ); -void __invala_fr(const int whichFloatReg /* 0-127 */ ); -void __nop(const int); -void __fc(__u64 *addr); -void __sum(int mask); -void __rum(int mask); -void __ssm(int mask); -void __rsm(int mask); -__u64 __thash(__s64); -__u64 __ttag(__s64); -__s64 __tpa(__s64); - -/* Intrinsics for implementing get/put_user macros */ -void __st_user(const char *tableName, __u64 addr, char size, char relocType, __u64 val); -void __ld_user(const char *tableName, __u64 addr, char size, char relocType); - -/* This intrinsic does not generate code, it creates a barrier across which - * the compiler will not schedule data access instructions. - */ -void __memory_barrier(void); - -void __isrlz(void); -void __dsrlz(void); - -__u64 _m64_mux1(__u64 a, const int n); -__u64 __thash(__u64); - -/* Lock and Atomic Operation Related Intrinsics */ -__u64 _InterlockedExchange8(volatile __u8 *trgt, __u8 value); -__u64 _InterlockedExchange16(volatile __u16 *trgt, __u16 value); -__s64 _InterlockedExchange(volatile __u32 *trgt, __u32 value); -__s64 _InterlockedExchange64(volatile __u64 *trgt, __u64 value); - -__u64 _InterlockedCompareExchange8_rel(volatile __u8 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange8_acq(volatile __u8 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange16_rel(volatile __u16 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange16_acq(volatile __u16 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange_rel(volatile __u32 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange_acq(volatile __u32 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange64_rel(volatile __u64 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange64_acq(volatile __u64 *dest, __u64 xchg, __u64 comp); - -__s64 _m64_dep_mi(const int v, __s64 s, const int p, const int len); -__s64 _m64_shrp(__s64 a, __s64 b, const int count); -__s64 _m64_popcnt(__s64 a); +#include <ia64intrin.h> #define ia64_barrier() __memory_barrier() @@ -122,15 +19,16 @@ __s64 _m64_popcnt(__s64 a); #define ia64_getreg __getReg #define ia64_setreg __setReg -#define ia64_hint(x) +#define ia64_hint __hint +#define ia64_hint_pause __hint_pause -#define ia64_mux1_brcst 0 -#define ia64_mux1_mix 8 -#define ia64_mux1_shuf 9 -#define ia64_mux1_alt 10 -#define ia64_mux1_rev 11 +#define ia64_mux1_brcst _m64_mux1_brcst +#define ia64_mux1_mix _m64_mux1_mix +#define ia64_mux1_shuf _m64_mux1_shuf +#define ia64_mux1_alt _m64_mux1_alt +#define ia64_mux1_rev _m64_mux1_rev -#define ia64_mux1 _m64_mux1 +#define ia64_mux1(x,v) _m_to_int64(_m64_mux1(_m_from_int64(x), (v))) #define ia64_popcnt _m64_popcnt #define ia64_getf_exp __getf_exp #define ia64_shrp _m64_shrp @@ -158,7 +56,7 @@ __s64 _m64_popcnt(__s64 a); #define ia64_stf8 __stf8 #define ia64_stf_spill __stf_spill -#define 
ia64_mf __mf +#define ia64_mf __mf #define ia64_mfa __mfa #define ia64_fetchadd4_acq __fetchadd4_acq @@ -234,10 +132,10 @@ __s64 _m64_popcnt(__s64 a); /* Values for lfhint in __lfetch and __lfetch_fault */ -#define ia64_lfhint_none 0 -#define ia64_lfhint_nt1 1 -#define ia64_lfhint_nt2 2 -#define ia64_lfhint_nta 3 +#define ia64_lfhint_none __lfhint_none +#define ia64_lfhint_nt1 __lfhint_nt1 +#define ia64_lfhint_nt2 __lfhint_nt2 +#define ia64_lfhint_nta __lfhint_nta #define ia64_lfetch __lfetch #define ia64_lfetch_excl __lfetch_excl @@ -254,4 +152,6 @@ do { \ } \ } while (0) +#define __builtin_trap() __break(0); + #endif /* _ASM_IA64_INTEL_INTRIN_H */ diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h index ca5ea994d68..c3e4ed8a3e1 100644 --- a/include/asm-ia64/machvec.h +++ b/include/asm-ia64/machvec.h @@ -20,6 +20,7 @@ struct scatterlist; struct page; struct mm_struct; struct pci_bus; +struct task_struct; typedef void ia64_mv_setup_t (char **); typedef void ia64_mv_cpu_init_t (void); @@ -34,6 +35,7 @@ typedef int ia64_mv_pci_legacy_read_t (struct pci_bus *, u16 port, u32 *val, u8 size); typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val, u8 size); +typedef void ia64_mv_migrate_t(struct task_struct * task); /* DMA-mapping interface: */ typedef void ia64_mv_dma_init (void); @@ -85,6 +87,11 @@ machvec_noop_mm (struct mm_struct *mm) { } +static inline void +machvec_noop_task (struct task_struct *task) +{ +} + extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *, struct pt_regs *); extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int); @@ -146,6 +153,7 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_readw_relaxed ia64_mv.readw_relaxed # define platform_readl_relaxed ia64_mv.readl_relaxed # define platform_readq_relaxed ia64_mv.readq_relaxed +# define platform_migrate ia64_mv.migrate # endif /* __attribute__((__aligned__(16))) is required to make size of the @@ -194,6 +202,7 @@ struct ia64_machine_vector { ia64_mv_readw_relaxed_t *readw_relaxed; ia64_mv_readl_relaxed_t *readl_relaxed; ia64_mv_readq_relaxed_t *readq_relaxed; + ia64_mv_migrate_t *migrate; } __attribute__((__aligned__(16))); /* align attrib? see above comment */ #define MACHVEC_INIT(name) \ @@ -238,6 +247,7 @@ struct ia64_machine_vector { platform_readw_relaxed, \ platform_readl_relaxed, \ platform_readq_relaxed, \ + platform_migrate, \ } extern struct ia64_machine_vector ia64_mv; @@ -386,5 +396,8 @@ extern ia64_mv_dma_supported swiotlb_dma_supported; #ifndef platform_readq_relaxed # define platform_readq_relaxed __ia64_readq_relaxed #endif +#ifndef platform_migrate +# define platform_migrate machvec_noop_task +#endif #endif /* _ASM_IA64_MACHVEC_H */ diff --git a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h index 03d00faf03b..da1d43755af 100644 --- a/include/asm-ia64/machvec_sn2.h +++ b/include/asm-ia64/machvec_sn2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003, 2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2002-2003,2006 Silicon Graphics, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -66,6 +66,7 @@ extern ia64_mv_dma_sync_single_for_device sn_dma_sync_single_for_device; extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device; extern ia64_mv_dma_mapping_error sn_dma_mapping_error; extern ia64_mv_dma_supported sn_dma_supported; +extern ia64_mv_migrate_t sn_migrate; /* * This stuff has dual use! @@ -115,6 +116,7 @@ extern ia64_mv_dma_supported sn_dma_supported; #define platform_dma_sync_sg_for_device sn_dma_sync_sg_for_device #define platform_dma_mapping_error sn_dma_mapping_error #define platform_dma_supported sn_dma_supported +#define platform_migrate sn_migrate #include <asm/sn/io.h> diff --git a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h index c7d9c9ed38b..bfbbb8da79c 100644 --- a/include/asm-ia64/mca.h +++ b/include/asm-ia64/mca.h @@ -131,6 +131,8 @@ struct ia64_mca_cpu { /* Array of physical addresses of each CPU's MCA area. */ extern unsigned long __per_cpu_mca[NR_CPUS]; +extern int cpe_vector; +extern int ia64_cpe_irq; extern void ia64_mca_init(void); extern void ia64_mca_cpu_init(void *); extern void ia64_os_mca_dispatch(void); diff --git a/include/asm-ia64/mutex.h b/include/asm-ia64/mutex.h index 458c1f7fbc1..5a3224f6af3 100644 --- a/include/asm-ia64/mutex.h +++ b/include/asm-ia64/mutex.h @@ -1,9 +1,92 @@ /* - * Pull in the generic implementation for the mutex fastpath. + * ia64 implementation of the mutex fastpath. * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) + * Copyright (C) 2006 Ken Chen <kenneth.w.chen@intel.com> + * + */ + +#ifndef _ASM_MUTEX_H +#define _ASM_MUTEX_H + +/** + * __mutex_fastpath_lock - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if + * it wasn't 1 originally. This function MUST leave the value lower than + * 1 even when the "1" assertion wasn't true. + */ +static inline void +__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) +{ + if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) + fail_fn(count); +} + +/** + * __mutex_fastpath_lock_retval - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if + * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, + * or anything the slow path function returns. + */ +static inline int +__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) +{ + if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) + return fail_fn(count); + return 0; +} + +/** + * __mutex_fastpath_unlock - try to promote the count from 0 to 1 + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 0 + * + * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>. + * In the failure case, this function is allowed to either set the value to + * 1, or to set it to a value lower than 1. 
+ * + * If the implementation sets it to a value of lower than 1, then the + * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs + * to return 0 otherwise. + */ +static inline void +__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) +{ + int ret = ia64_fetchadd4_rel(count, 1); + if (unlikely(ret < 0)) + fail_fn(count); +} + +#define __mutex_slowpath_needs_to_unlock() 1 + +/** + * __mutex_fastpath_trylock - try to acquire the mutex, without waiting + * + * @count: pointer of type atomic_t + * @fail_fn: fallback function + * + * Change the count from 1 to a value lower than 1, and return 0 (failure) + * if it wasn't 1 originally, or return 1 (success) otherwise. This function + * MUST leave the value lower than 1 even when the "1" assertion wasn't true. + * Additionally, if the value was < 0 originally, this function must not leave + * it to 0 on failure. + * + * If the architecture has no effective trylock variant, it should call the + * <fail_fn> spinlock-based trylock variant unconditionally. */ +static inline int +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) +{ + if (likely(cmpxchg_acq(count, 1, 0)) == 1) + return 1; + return 0; +} -#include <asm-generic/mutex-dec.h> +#endif diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 5e6362a786b..3ab27333dae 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,8 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE +# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ #ifdef __ASSEMBLY__ diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index e2560c58384..c0f8144f234 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) -#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) +#define pte_mkhuge(pte) (__pte(pte_val(pte))) /* * Macro to a page protection value as "uncacheable". Note that "protection" is really a @@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr; #define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3)) #define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT) #define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1)) -struct mmu_gather; -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, - unsigned long end, unsigned long floor, unsigned long ceiling); #endif /* diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index 23c8e1be191..128fefd8056 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -50,7 +50,8 @@ #define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */ #define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */ #define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. 
*/ - /* bit 5 is currently unused */ +#define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration + sync at ctx sw */ #define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6) /* don't log any fpswa faults */ #define IA64_THREAD_FPEMU_SIGFPE (__IA64_UL(1) << 7) /* send a SIGFPE for fpswa faults */ diff --git a/include/asm-ia64/signal.h b/include/asm-ia64/signal.h index 608168d713d..5e328ed5d01 100644 --- a/include/asm-ia64/signal.h +++ b/include/asm-ia64/signal.h @@ -158,8 +158,6 @@ struct k_sigaction { #define ptrace_signal_deliver(regs, cookie) do { } while (0) -void set_sigdelayed(pid_t pid, int signo, int code, void __user *addr); - #endif /* __KERNEL__ */ # endif /* !__ASSEMBLY__ */ diff --git a/include/asm-ia64/sn/addrs.h b/include/asm-ia64/sn/addrs.h index 2c32e4b77b5..1d9efe54166 100644 --- a/include/asm-ia64/sn/addrs.h +++ b/include/asm-ia64/sn/addrs.h @@ -283,5 +283,13 @@ #define REMOTE_HUB_L(n, a) HUB_L(REMOTE_HUB_ADDR((n), (a))) #define REMOTE_HUB_S(n, a, d) HUB_S(REMOTE_HUB_ADDR((n), (a)), (d)) +/* + * Coretalk address breakdown + */ +#define CTALK_NASID_SHFT 40 +#define CTALK_NASID_MASK (0x3FFFULL << CTALK_NASID_SHFT) +#define CTALK_CID_SHFT 38 +#define CTALK_CID_MASK (0x3ULL << CTALK_CID_SHFT) +#define CTALK_NODE_OFFSET 0x3FFFFFFFFF #endif /* _ASM_IA64_SN_ADDRS_H */ diff --git a/include/asm-ia64/sn/rw_mmr.h b/include/asm-ia64/sn/rw_mmr.h index f40fd1a5510..2d78f4c5a45 100644 --- a/include/asm-ia64/sn/rw_mmr.h +++ b/include/asm-ia64/sn/rw_mmr.h @@ -3,15 +3,14 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2002-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2002-2006 Silicon Graphics, Inc. All Rights Reserved. */ #ifndef _ASM_IA64_SN_RW_MMR_H #define _ASM_IA64_SN_RW_MMR_H /* - * This file contains macros used to access MMR registers via - * uncached physical addresses. + * This file that access MMRs via uncached physical addresses. 
* pio_phys_read_mmr - read an MMR * pio_phys_write_mmr - write an MMR * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 @@ -22,53 +21,8 @@ */ -extern inline long -pio_phys_read_mmr(volatile long *mmr) -{ - long val; - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt;;" - "srlz.i;;" - "ld8.acq %0=[%1];;" - "mov psr.l=r2;;" - "srlz.i;;" - : "=r"(val) - : "r"(mmr) - : "r2"); - return val; -} - - - -extern inline void -pio_phys_write_mmr(volatile long *mmr, long val) -{ - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt;;" - "srlz.i;;" - "st8.rel [%0]=%1;;" - "mov psr.l=r2;;" - "srlz.i;;" - :: "r"(mmr), "r"(val) - : "r2", "memory"); -} - -extern inline void -pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2) -{ - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt | psr.ic;;" - "cmp.ne p9,p0=%2,r0;" - "srlz.i;;" - "st8.rel [%0]=%1;" - "(p9) st8.rel [%2]=%3;;" - "mov psr.l=r2;;" - "srlz.i;;" - :: "r"(mmr1), "r"(val1), "r"(mmr2), "r"(val2) - : "p9", "r2", "memory"); -} +extern long pio_phys_read_mmr(volatile long *mmr); +extern void pio_phys_write_mmr(volatile long *mmr, long val); +extern void pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2); #endif /* _ASM_IA64_SN_RW_MMR_H */ diff --git a/include/asm-ia64/sn/tioce.h b/include/asm-ia64/sn/tioce.h index d4c990712ea..893468e1b41 100644 --- a/include/asm-ia64/sn/tioce.h +++ b/include/asm-ia64/sn/tioce.h @@ -11,7 +11,7 @@ /* CE ASIC part & mfgr information */ #define TIOCE_PART_NUM 0xCE00 -#define TIOCE_MFGR_NUM 0x36 +#define TIOCE_SRC_ID 0x01 #define TIOCE_REV_A 0x1 /* CE Virtual PPB Vendor/Device IDs */ @@ -20,7 +20,7 @@ /* CE Host Bridge Vendor/Device IDs */ #define CE_HOST_BRIDGE_VENDOR_ID 0x10a9 -#define CE_HOST_BRIDGE_DEVICE_ID 0x4003 +#define CE_HOST_BRIDGE_DEVICE_ID 0x4001 #define TIOCE_NUM_M40_ATES 4096 @@ -463,6 +463,25 @@ typedef volatile struct tioce { u64 ce_end_of_struct; /* 0x044400 */ } tioce_t; +/* ce_lsiX_gb_cfg1 register bit masks & shifts */ +#define CE_LSI_GB_CFG1_RXL0S_THS_SHFT 0 +#define CE_LSI_GB_CFG1_RXL0S_THS_MASK (0xffULL << 0) +#define CE_LSI_GB_CFG1_RXL0S_SMP_SHFT 8 +#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK (0xfULL << 8); +#define CE_LSI_GB_CFG1_RXL0S_ADJ_SHFT 12 +#define CE_LSI_GB_CFG1_RXL0S_ADJ_MASK (0x7ULL << 12) +#define CE_LSI_GB_CFG1_RXL0S_FLT_SHFT 15 +#define CE_LSI_GB_CFG1_RXL0S_FLT_MASK (0x1ULL << 15) +#define CE_LSI_GB_CFG1_LPBK_SEL_SHFT 16 +#define CE_LSI_GB_CFG1_LPBK_SEL_MASK (0x3ULL << 16) +#define CE_LSI_GB_CFG1_LPBK_EN_SHFT 18 +#define CE_LSI_GB_CFG1_LPBK_EN_MASK (0x1ULL << 18) +#define CE_LSI_GB_CFG1_RVRS_LB_SHFT 19 +#define CE_LSI_GB_CFG1_RVRS_LB_MASK (0x1ULL << 19) +#define CE_LSI_GB_CFG1_RVRS_CLK_SHFT 20 +#define CE_LSI_GB_CFG1_RVRS_CLK_MASK (0x3ULL << 20) +#define CE_LSI_GB_CFG1_SLF_TS_SHFT 24 +#define CE_LSI_GB_CFG1_SLF_TS_MASK (0xfULL << 24) /* ce_adm_int_mask/ce_adm_int_status register bit defines */ #define CE_ADM_INT_CE_ERROR_SHFT 0 @@ -592,6 +611,11 @@ typedef volatile struct tioce { #define CE_URE_RD_MRG_ENABLE (0x1ULL << 0) #define CE_URE_WRT_MRG_ENABLE1 (0x1ULL << 4) #define CE_URE_WRT_MRG_ENABLE2 (0x1ULL << 5) +#define CE_URE_WRT_MRG_TIMER_SHFT 12 +#define CE_URE_WRT_MRG_TIMER_MASK (0x7FFULL << CE_URE_WRT_MRG_TIMER_SHFT) +#define CE_URE_WRT_MRG_TIMER(x) (((u64)(x) << \ + CE_URE_WRT_MRG_TIMER_SHFT) & \ + CE_URE_WRT_MRG_TIMER_MASK) #define CE_URE_RSPQ_BYPASS_DISABLE (0x1ULL << 24) #define CE_URE_UPS_DAT1_PAR_DISABLE (0x1ULL << 32) #define CE_URE_UPS_HDR1_PAR_DISABLE (0x1ULL << 33) 
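
The CE_URE_WRT_MRG_TIMER(x) macro added above follows the usual shift-and-mask idiom for building a register field: shift the value into position, then mask it to the field width so out-of-range input cannot spill into neighbouring bits. A small stand-alone sketch of the same idiom follows; the EXAMPLE_FIELD_* names are invented and do not belong to the real TIO CE register layout.

    /* Hedged sketch of the shift-and-mask field encoding used by the
     * CE_URE_WRT_MRG_TIMER(x)-style macros above.  EXAMPLE_* names are
     * illustrative only. */
    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define EXAMPLE_FIELD_SHFT   12
    #define EXAMPLE_FIELD_MASK   (0x7FFULL << EXAMPLE_FIELD_SHFT)
    #define EXAMPLE_FIELD_SET(x) (((uint64_t)(x) << EXAMPLE_FIELD_SHFT) & \
                                  EXAMPLE_FIELD_MASK)

    int main(void)
    {
            uint64_t reg = 0;

            /* values wider than the field are silently truncated to it */
            reg |= EXAMPLE_FIELD_SET(0x123);
            printf("reg = 0x%" PRIx64 "\n", reg);    /* prints 0x123000 */
            return 0;
    }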
@@ -653,8 +677,12 @@ typedef volatile struct tioce { #define CE_URE_SI (0x1ULL << 0) #define CE_URE_ELAL_SHFT 4 #define CE_URE_ELAL_MASK (0x7ULL << CE_URE_ELAL_SHFT) +#define CE_URE_ELAL_SET(n) (((u64)(n) << CE_URE_ELAL_SHFT) & \ + CE_URE_ELAL_MASK) #define CE_URE_ELAL1_SHFT 8 #define CE_URE_ELAL1_MASK (0x7ULL << CE_URE_ELAL1_SHFT) +#define CE_URE_ELAL1_SET(n) (((u64)(n) << CE_URE_ELAL1_SHFT) & \ + CE_URE_ELAL1_MASK) #define CE_URE_SCC (0x1ULL << 12) #define CE_URE_PN1_SHFT 16 #define CE_URE_PN1_MASK (0xFFULL << CE_URE_PN1_SHFT) @@ -675,8 +703,12 @@ typedef volatile struct tioce { #define CE_URE_HPC (0x1ULL << 6) #define CE_URE_SPLV_SHFT 7 #define CE_URE_SPLV_MASK (0xFFULL << CE_URE_SPLV_SHFT) +#define CE_URE_SPLV_SET(n) (((u64)(n) << CE_URE_SPLV_SHFT) & \ + CE_URE_SPLV_MASK) #define CE_URE_SPLS_SHFT 15 #define CE_URE_SPLS_MASK (0x3ULL << CE_URE_SPLS_SHFT) +#define CE_URE_SPLS_SET(n) (((u64)(n) << CE_URE_SPLS_SHFT) & \ + CE_URE_SPLS_MASK) #define CE_URE_PSN1_SHFT 19 #define CE_URE_PSN1_MASK (0x1FFFULL << CE_URE_PSN1_SHFT) #define CE_URE_PSN2_SHFT 32 diff --git a/include/asm-ia64/sn/xpc.h b/include/asm-ia64/sn/xpc.h index df7f5f4f3cd..aa3b8ace903 100644 --- a/include/asm-ia64/sn/xpc.h +++ b/include/asm-ia64/sn/xpc.h @@ -1227,28 +1227,6 @@ xpc_map_bte_errors(bte_result_t error) -static inline void * -xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) -{ - /* see if kmalloc will give us cachline aligned memory by default */ - *base = kmalloc(size, flags); - if (*base == NULL) { - return NULL; - } - if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { - return *base; - } - kfree(*base); - - /* nope, we'll have to do it ourselves */ - *base = kmalloc(size + L1_CACHE_BYTES, flags); - if (*base == NULL) { - return NULL; - } - return (void *) L1_CACHE_ALIGN((u64) *base); -} - - /* * Check to see if there is any channel activity to/from the specified * partition. 
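
The xpc.h hunk above removes xpc_kmalloc_cacheline_aligned(), which tried a plain kmalloc() first and, when the result happened not to be L1-cache aligned, re-allocated with a cache line of slack and rounded the pointer up, handing the raw allocation back through *base for the eventual kfree(). A hedged user-space sketch of that allocate-with-slack trick is shown below; CACHELINE and the function name are illustrative only, with malloc/free standing in for kmalloc/kfree.

    /* Hedged sketch of the "allocate with slack, round up" trick used by
     * the helper removed above.  The rounded pointer must never be freed
     * directly, which is why the raw pointer is kept in *base. */
    #include <stdint.h>
    #include <stdlib.h>

    #define CACHELINE 128UL                 /* illustrative line size */

    static void *cacheline_aligned_alloc(size_t size, void **base)
    {
            *base = malloc(size);
            if (!*base)
                    return NULL;
            if (((uintptr_t)*base & (CACHELINE - 1)) == 0)
                    return *base;           /* already aligned, use as is */
            free(*base);

            /* over-allocate by one line and round the pointer up */
            *base = malloc(size + CACHELINE);
            if (!*base)
                    return NULL;
            return (void *)(((uintptr_t)*base + CACHELINE - 1) &
                            ~(CACHELINE - 1));
    }

    int main(void)
    {
            void *base;
            void *p = cacheline_aligned_alloc(200, &base);

            /* p is the aligned buffer; base is what must be freed */
            free(base);
            return (p != NULL) ? 0 : 1;
    }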
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h index 06253871562..cd4233d66f1 100644 --- a/include/asm-ia64/system.h +++ b/include/asm-ia64/system.h @@ -244,6 +244,13 @@ extern void ia64_load_extra (struct task_struct *task); __ia64_save_fpu((prev)->thread.fph); \ } \ __switch_to(prev, next, last); \ + /* "next" in old context is "current" in new context */ \ + if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) && \ + (task_cpu(current) != \ + task_thread_info(current)->last_cpu))) { \ + platform_migrate(current); \ + task_thread_info(current)->last_cpu = task_cpu(current); \ + } \ } while (0) #else # define switch_to(prev,next,last) __switch_to(prev, next, last) diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 1d6518fe1f0..56394a2c705 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -26,16 +26,10 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ + __u32 last_cpu; /* Last CPU thread ran on */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; - struct { - int signo; - int code; - void __user *addr; - unsigned long start_time; - pid_t pid; - } sigdelayed; /* Saved information for TIF_SIGDELAYED */ }; #define THREAD_SIZE KERNEL_STACK_SIZE @@ -89,7 +83,6 @@ struct thread_info { #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_TRACE 3 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ -#define TIF_SIGDELAYED 5 /* signal delayed from MCA/INIT/NMI/PMI context */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ @@ -101,13 +94,12 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SIGDELAYED (1 << TIF_SIGDELAYED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) /* "work to do on user-return" bits */ -#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SIGDELAYED) +#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index e38931379a7..185ee15963a 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -#ifdef CONFIG_HUGETLB_PAGE -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - free_pgd_range(tlb, addr, end, floor, ceiling) -#endif - /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. 
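The switch_to() hunk above only invokes platform_migrate() when a task flagged with IA64_THREAD_MIGRATION is found running on a CPU other than the one cached in its thread_info->last_cpu. A small userspace model of that check, with the flag constant and the hook function as stand-ins rather than kernel API, could be:

	#include <stdio.h>

	struct thread_model {
		unsigned int flags;
		unsigned int last_cpu;
	};

	#define THREAD_MIGRATION 0x1	/* stand-in for IA64_THREAD_MIGRATION */

	static void platform_migrate_hook(struct thread_model *t, unsigned int cpu)
	{
		printf("migrating per-thread state to cpu %u\n", cpu);
	}

	static void on_context_switch(struct thread_model *t, unsigned int cpu)
	{
		/* run the hook only when a flagged thread has changed CPUs */
		if ((t->flags & THREAD_MIGRATION) && cpu != t->last_cpu) {
			platform_migrate_hook(t, cpu);
			t->last_cpu = cpu;
		}
	}

	int main(void)
	{
		struct thread_model t = { .flags = THREAD_MIGRATION, .last_cpu = 0 };

		on_context_switch(&t, 0);	/* same CPU: no hook */
		on_context_switch(&t, 2);	/* moved to CPU 2: hook runs once */
		return 0;
	}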
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h index 3417dd71ab4..e28aaf28e4a 100644 --- a/include/asm-s390/pgalloc.h +++ b/include/asm-s390/pgalloc.h @@ -158,11 +158,4 @@ static inline void pte_free(struct page *pte) #define __pte_free_tlb(tlb,pte) tlb_remove_page(tlb,pte) -/* - * This establishes kernel virtual mappings (e.g., as a result of a - * vmalloc call). Since s390-esame uses a separate kernel page table, - * there is nothing to do here... :) - */ -#define set_pgdir(addr,entry) do { } while(0) - #endif /* _S390_PGALLOC_H */ diff --git a/include/asm-sh64/pgalloc.h b/include/asm-sh64/pgalloc.h index 678251ac1db..b29dd468817 100644 --- a/include/asm-sh64/pgalloc.h +++ b/include/asm-sh64/pgalloc.h @@ -167,22 +167,6 @@ static __inline__ void pmd_free(pmd_t *pmd) extern int do_check_pgt_cache(int, int); -static inline void set_pgdir(unsigned long address, pgd_t entry) -{ - struct task_struct * p; - pgd_t *pgd; - - read_lock(&tasklist_lock); - for_each_process(p) { - if (!p->mm) - continue; - *pgd_offset(p->mm,address) = entry; - } - read_unlock(&tasklist_lock); - for (pgd = (pgd_t *)pgd_quicklist; pgd; pgd = (pgd_t *)*(unsigned long *)pgd) - pgd[address >> PGDIR_SHIFT] = entry; -} - #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) (pte))) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 715fd94cf57..a617d364d08 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } struct vm_area_struct; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad6b17..d6f1019625a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,10 +20,7 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct 
vm_area_struct *vma, unsigned long address, int write_access); @@ -39,18 +36,35 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range +#else +void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, + unsigned long ceiling); #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define prepare_hugepage_range(addr, len) \ - is_aligned_hugepage_range(addr, len) +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. + */ +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} #else int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif @@ -87,20 +101,17 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define is_aligned_hugepage_range(addr, len) 0 #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE @@ -128,6 +139,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -144,6 +157,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/include/linux/migrate.h b/include/linux/migrate.h new file mode 100644 index 00000000000..7d09962c3c0 --- /dev/null +++ b/include/linux/migrate.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_MIGRATE_H +#define _LINUX_MIGRATE_H + +#include <linux/config.h> 
+#include <linux/mm.h> + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p, struct list_head *pagelist); +extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest); +extern int fail_migrate_page(struct page *, struct page *); + +extern int migrate_prep(void); + +#else + +static inline int isolate_lru_page(struct page *p, struct list_head *list) + { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } + +static inline int migrate_prep(void) { return -ENOSYS; } + +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL + +#endif /* CONFIG_MIGRATION */ +#endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 498ff8778fb..6aa016f1d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -286,43 +286,34 @@ struct page { * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. */ /* * Drop a ref, return true if the logical refcount fell to zero (the page has * no users) */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(atomic_read(&(p)->_count) == -1);\ - atomic_add_negative(-1, &(p)->_count); \ - }) +static inline int put_page_testzero(struct page *page) +{ + BUG_ON(atomic_read(&page->_count) == 0); + return atomic_dec_and_test(&page->_count); +} /* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page + * Try to grab a ref unless the page has a refcount of zero, return false if + * that is the case. 
*/ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) - -#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) -#define __put_page(p) atomic_dec(&(p)->_count) +static inline int get_page_unless_zero(struct page *page) +{ + return atomic_inc_not_zero(&page->_count); +} extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) { - if (PageCompound(page)) + if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); - return atomic_read(&page->_count) + 1; + return atomic_read(&page->_count); } static inline void get_page(struct page *page) @@ -332,8 +323,19 @@ static inline void get_page(struct page *page) atomic_inc(&page->_count); } +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + atomic_set(&page->_count, 1); +} + void put_page(struct page *page); +void split_page(struct page *page, unsigned int order); + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of @@ -1046,7 +1048,7 @@ int in_gate_area_no_task(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); void drop_pagecache(void); void drop_slab(void); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8ac854f7f19..3b6723dfaff 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); if (PageActive(page)) { - ClearPageActive(page); + __ClearPageActive(page); zone->nr_active--; } else { zone->nr_inactive--; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d52999c4333..9ea629c02a4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,9 @@ * - The __xxx_page_state variants can be used safely when interrupts are * disabled. * - The __xxx_page_state variants can be used if the field is only - * modified from process context, or only modified from interrupt context. - * In this case, the field should be commented here. + * modified from process context and protected from preemption, or only + * modified from interrupt context. In this case, the field should be + * commented here. 
*/ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ @@ -239,22 +240,19 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) -#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define __ClearPageLRU(page) __clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -#define ClearPageSlab(page) clear_bit(PG_slab, &(page)->flags) -#define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags) -#define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) +#define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags) +#define __ClearPageSlab(page) __clear_bit(PG_slab, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) is_highmem(page_zone(page)) @@ -329,8 +327,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) #define PageCompound(page) test_bit(PG_compound, &(page)->flags) -#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) -#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags) +#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags) #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0b2ba67ff13..b739ac1f7ca 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -11,8 +11,6 @@ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ -#include <linux/interrupt.h> - /* * The struct used to pass data via the following ioctl. 
Similar to the * struct tm in <time.h>, but it needs to be here so that the kernel @@ -95,6 +93,8 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include <linux/interrupt.h> + typedef struct rtc_task { void (*func)(void *private_data); void *private_data; diff --git a/include/linux/slab.h b/include/linux/slab.h index 8cf52939d0a..2b28c849d75 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ #define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ #define SLAB_POISON 0x00000800UL /* Poison objects */ -#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ #define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ #define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ @@ -118,7 +117,7 @@ extern void *kzalloc(size_t, gfp_t); */ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) { - if (n != 0 && size > INT_MAX / n) + if (n != 0 && size > ULONG_MAX / n) return NULL; return kzalloc(n * size, flags); } diff --git a/include/linux/smp.h b/include/linux/smp.h index 44153fdf73f..d699a16b0cb 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -extern int smp_call_function (void (*func) (void *info), void *info, - int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); /* * Call a function on all processors */ -static inline int on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, retry, wait); - func(info); - preempt_enable(); - return ret; -} +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void); #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) -#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) +#define on_each_cpu(func,info,retry,wait) \ + ({ \ + local_irq_disable(); \ + func(info); \ + local_irq_enable(); \ + 0; \ + }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/include/linux/swap.h b/include/linux/swap.h index d572b19afb7..12415dd9445 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,9 +172,24 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, gfp_t); -extern int shrink_all_memory(int); +extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int remove_mapping(struct address_space *mapping, struct page *page); + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +extern pageout_t pageout(struct page *page, struct 
address_space *mapping); #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p); -extern int putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); -extern int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); -extern int fail_migrate_page(struct page *, struct page *); -#else -static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } -static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL -#define fail_migrate_page NULL -#endif - #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/kernel/fork.c b/kernel/fork.c index b373322ca49..9bd7b65ee41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) check_unshare_flags(&unshare_flags); + /* Return -EINVAL for all unsupported flags */ + err = -EINVAL; + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + goto bad_unshare_out; + if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) diff --git a/kernel/sched.c b/kernel/sched.c index 4d46e90f59c..6b6e0d70eb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -707,12 +707,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now) DEF_TIMESLICE); } else { /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1; - - /* * Tasks waking from uninterruptible sleep are * limited in their sleep_avg rise as they * are likely to be waiting on I/O diff --git a/kernel/softirq.c b/kernel/softirq.c index ad3295cdded..ec8fed42a86 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/kthread.h> #include <linux/rcupdate.h> +#include <linux/smp.h> #include <asm/irq.h> /* @@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_SMP +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +{ + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, retry, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); +#endif diff --git a/lib/string.c b/lib/string.c index 037a48acedb..b3c28a3f633 100644 --- a/lib/string.c +++ b/lib/string.c @@ -403,7 +403,6 @@ char *strpbrk(const char *cs, const char *ct) } return NULL; } -EXPORT_SYMBOL(strpbrk); #endif #ifndef __HAVE_ARCH_STRSEP diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae640..bd80460360d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION + bool "Page migration" def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dcc..f10c753dce6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d47699..e8f58f7dd7a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,6 +30,8 @@ #include <linux/security.h> #include <linux/syscalls.h> #include "filemap.h" +#include "internal.h" + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 508707704d2..ebad6bbb350 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,24 +13,48 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/cpuset.h> +#include <linux/mutex.h> #include <asm/page.h> #include <asm/pgtable.h> #include <linux/hugetlb.h> +#include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; - /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ static DEFINE_SPINLOCK(hugetlb_lock); +static void clear_huge_page(struct page *page, unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + cond_resched(); + clear_user_highpage(page + i, addr); + } +} + +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + cond_resched(); + 
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + } +} + static void enqueue_huge_page(struct page *page) { int nid = page_to_nid(page); @@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } -static struct page *alloc_fresh_huge_page(void) +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + +static int alloc_fresh_huge_page(void) { static int nid = 0; struct page *page; page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); - nid = (nid + 1) % num_online_nodes(); + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); if (page) { + page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + return 1; } - return page; + return 0; } -void free_huge_page(struct page *page) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { - BUG_ON(page_count(page)); + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page; + int use_reserve = 0; + unsigned long idx; - INIT_LIST_HEAD(&page->lru); - page[1].lru.next = NULL; /* reset dtor */ + spin_lock(&hugetlb_lock); + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; + } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + + spin_unlock(&hugetlb_lock); + set_page_refcounted(page); + return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). 
+ */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; spin_lock(&hugetlb_lock); - enqueue_huge_page(page); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); + + return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) { + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; struct page *page; - int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. 
*/ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); - set_page_count(page, 1); - page[1].lru.next = (void *)free_huge_page; /* set dtor */ - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_user_highpage(&page[i], addr); - return page; } static int __init hugetlb_init(void) { unsigned long i; - struct page *page; if (HPAGE_SHIFT == 0) return 0; @@ -123,12 +266,8 @@ static int __init hugetlb_init(void) INIT_LIST_HEAD(&hugepage_freelists[i]); for (i = 0; i < max_huge_pages; ++i) { - page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) break; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } max_huge_pages = free_huge_pages = nr_huge_pages = i; printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); @@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); - set_page_count(&page[i], 0); } - set_page_count(page, 1); + page[1].lru.next = NULL; + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) static unsigned long set_max_huge_pages(unsigned long count) { while (count > nr_huge_pages) { - struct page *page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) return nr_huge_pages; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } if (count >= nr_huge_pages) return nr_huge_pages; @@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { @@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { struct page *old_page, *new_page; - int i, avoidcopy; + int avoidcopy; old_page = pte_page(pte); @@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) - copy_user_highpage(new_page + i, old_page + i, - address + i*PAGE_SIZE); + copy_huge_page(new_page, old_page, address); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -442,6 +572,7 @@ retry: ret = VM_FAULT_OOM; goto out; } + clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { int err; @@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) return VM_FAULT_OOM; + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; - if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep, write_access); + if (pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + mutex_unlock(&hugetlb_instantiation_mutex); + return ret; + } ret = VM_FAULT_MINOR; @@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); + mutex_unlock(&hugetlb_instantiation_mutex); return ret; } @@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) { - unsigned long vpfn, vaddr = *position; + unsigned long pfn_offset; + unsigned long vaddr = *position; int remainder = *length; - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - get_page(page); - pages[i] = page; - } + pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + page = pte_page(*pte); +same_page: + get_page(page); + if (pages) + pages[i] = page + pfn_offset; if (vmas) vmas[i] = vma; vaddr += PAGE_SIZE; - ++vpfn; + ++pfn_offset; --remainder; ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. 
+ */ + goto same_page; + } } spin_unlock(&mm->page_table_lock); *length = remainder; @@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/internal.h b/mm/internal.h index 17256bb2f4e..d20e3cc4aef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,23 +8,33 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H -static inline void set_page_refs(struct page *page, int order) +#include <linux/mm.h> + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) { -#ifdef CONFIG_MMU + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); -#else - int i; +} - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); } extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); + +#endif diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db2..80c3fb370f9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); @@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } - /* - * Add some anal sanity checks for now. 
Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - * - * Remove this test eventually! - */ +#ifdef CONFIG_DEBUG_VM if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } +#endif /* * NOTE! We still have PageReserved() pages in the page @@ -1221,9 +1214,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a39f0..e93cc740c22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -95,11 +96,8 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. 
- * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80480d..f71893ed354 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free); */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 00000000000..09f6e4aa87f --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. 
The main authors of the migration code are: + * + * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> + * Hirokazu Takahashi <taka@valinux.co.jp> + * Dave Hansen <haveblue@us.ibm.com> + * Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> /* for try_to_release_page(), + buffer_heads_over_limit */ +#include <linux/mm_inline.h> +#include <linux/pagevec.h> +#include <linux/rmap.h> +#include <linux/topology.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/swapops.h> + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. 
+ */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. 
+ */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. 
+ */
+ rc = -EAGAIN;
+ if (pass > 2)
+ lock_page(page);
+ else
+ if (TestSetPageLocked(page))
+ goto next;
+
+ /*
+ * Only wait on writeback if we have already done a pass where
+ * we may have triggered writeouts for lots of pages.
+ */
+ if (pass > 0) {
+ wait_on_page_writeback(page);
+ } else {
+ if (PageWriteback(page))
+ goto unlock_page;
+ }
+
+ /*
+ * Anonymous pages must have swap cache references otherwise
+ * the information contained in the page maps cannot be
+ * preserved.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_KERNEL)) {
+ rc = -ENOMEM;
+ goto unlock_page;
+ }
+ }
+
+ if (!to) {
+ rc = swap_page(page);
+ goto next;
+ }
+
+ newpage = lru_to_page(to);
+ lock_page(newpage);
+
+ /*
+ * Pages are properly locked and writeback is complete.
+ * Try to migrate the page.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ goto unlock_both;
+
+ if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
+ */
+ if (PageDirty(page)) {
+ switch (pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_both;
+
+ case PAGE_SUCCESS:
+ unlock_page(newpage);
+ goto next;
+
+ case PAGE_CLEAN:
+ ; /* try to migrate the page below */
+ }
+ }
+
+ /*
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (!page_has_buffers(page) ||
+ try_to_release_page(page, GFP_KERNEL)) {
+ rc = migrate_page(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * On early passes with mapped pages simply
+ * retry. There may be a lock held for some
+ * buffers that may go away. Later
+ * swap them out.
+ */
+ if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
+ unlock_page(newpage);
+ newpage = NULL;
+ rc = swap_page(page);
+ goto next;
+ }
+
+unlock_both:
+ unlock_page(newpage);
+
+unlock_page:
+ unlock_page(page);
+
+next:
+ if (rc == -EAGAIN) {
+ retry++;
+ } else if (rc) {
+ /* Permanent failure */
+ list_move(&page->lru, failed);
+ nr_failed++;
+ } else {
+ if (newpage) {
+ /* Successful migration. Return page to LRU */
+ move_to_lru(newpage);
+ }
+ list_move(&page->lru, moved);
+ }
+ }
+ if (retry && pass++ < 10)
+ goto redo;
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
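The two-list, ten-pass behaviour documented above is easier to see in isolation. Below is a minimal user-space sketch of the same retry bookkeeping, assuming an invented item type and per-pass outcome table (none of these names exist in the kernel); it illustrates only the control flow, not the migration work itself.

#include <stdio.h>

#define MAX_PASSES 10

/* Invented item: per-pass outcome, 0 = migrated, 1 = retry, 2 = permanent failure. */
struct item {
        int outcome_on_pass[MAX_PASSES];
};

/* Toy stand-in for a single migration attempt. */
static int try_migrate(const struct item *it, int pass)
{
        return it->outcome_on_pass[pass];
}

int main(void)
{
        struct item items[3] = {
                { .outcome_on_pass = { 1, 1, 0 } },     /* succeeds on the third pass */
                { .outcome_on_pass = { 0 } },           /* succeeds immediately */
                { .outcome_on_pass = { 1, 2 } },        /* fails permanently on pass 1 */
        };
        int done[3] = { 0 };
        int pass = 0, retry, nr_failed = 0;

        do {
                retry = 0;
                for (int i = 0; i < 3; i++) {
                        if (done[i])
                                continue;
                        switch (try_migrate(&items[i], pass)) {
                        case 0:                         /* moved */
                                done[i] = 1;
                                break;
                        case 1:                         /* try again on the next pass */
                                retry++;
                                break;
                        default:                        /* permanent failure */
                                done[i] = 1;
                                nr_failed++;
                        }
                }
        } while (retry && ++pass < MAX_PASSES);

        /* Mirrors the "nr_failed + retry" result of migrate_pages(). */
        printf("nr_failed=%d retry=%d\n", nr_failed, retry);
        return 0;
}

As in migrate_pages(), the value reported at the end is the count of permanent failures plus whatever was still awaiting retry when the passes ran out.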
+ */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. + * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} diff --git a/mm/mmap.c b/mm/mmap.c index 47556d2b3e9..0eb9894db6d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); -#ifdef CONFIG_HUGETLB - if (flags & VM_HUGETLB) { - if (!(flags & VM_DONTCOPY)) - mm->shared_vm += pages; - return; - } -#endif /* CONFIG_HUGETLB */ - if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1e..4c14d4289b6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f2..db45efac17c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) /* * kmalloc doesn't like __GFP_HIGHMEM for some reason */ - return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } struct page * vmalloc_to_page(void *addr) @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL); + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); if (!base) goto enomem; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd4895d1..b7f14a4799a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; -static void fastcall free_hot_cold_page(struct page *page, int cold); static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order) for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageCompound(p) | (page_private(p) != (unsigned long)page))) bad_page(page); - ClearPageCompound(p); + __ClearPageCompound(p); } } +static inline void prep_zero_page(struct page *page, 
int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * function for dealing with page's order in buddy system. * zone->lock is already acquired when we use these. @@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) mutex_debug_check_no_locks_freed(page_address(page), PAGE_SIZE<<order); -#ifndef CONFIG_MMU - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); -#endif - for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); if (reserved) @@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - - free_hot_cold_page(page, 0); + set_page_refcounted(page); + __free_page(page); } else { - LIST_HEAD(list); int loop; + prefetchw(page); for (loop = 0; loop < BITS_PER_LONG; loop++) { struct page *p = &page[loop]; - if (loop + 16 < BITS_PER_LONG) - prefetchw(p + 16); + if (loop + 1 < BITS_PER_LONG) + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } - arch_free_page(page, order); - - mod_page_state(pgfree, 1 << order); - - list_add(&page->lru, &list); - kernel_map_pages(page, 1 << order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + set_page_refcounted(page); + __free_pages(page, order); } } @@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + return 0; } @@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * Called from the slab reaper to drain pagesets on a particular node that * belong to the currently executing processor. + * Note that this function must be called with the thread pinned to + * a single processor. 
*/ void drain_node_pages(int nodeid) { int i, z; unsigned long flags; - local_irq_save(flags); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; @@ -609,11 +620,14 @@ void drain_node_pages(int nodeid) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count) { + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } } } - local_irq_restore(flags); } #endif @@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<<order) sub-pages: page[0..n] + * Each sub-page must be freed individually. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +void split_page(struct page *page, unsigned int order) { int i; - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); - for(i = 0; i < (1 << order); i++) - clear_highpage(page + i); + BUG_ON(PageCompound(page)); + BUG_ON(!page_count(page)); + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -795,14 +818,8 @@ again: put_cpu(); BUG_ON(bad_range(zone, page)); - if (prep_new_page(page, order)) + if (prep_new_page(page, order, gfp_flags)) goto again; - - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); - - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); return page; failed: @@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { - int cpu = 0; + unsigned cpu; memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - if (!cpu_isset(cpu, *cpumask)) - continue; + for_each_cpu_mask(cpu, *cpumask) { + unsigned long *in; + unsigned long *out; + unsigned off; + unsigned next_cpu; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (likely(cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, cpu)); + next_cpu = next_cpu(cpu, *cpumask); + if (likely(next_cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, next_cpu)); out = (unsigned long *)ret; for (off = 0; off < nr; off++) @@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaaa629..301b36c4a0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } +static inline void reset_ahead_window(struct file_ra_state *ra) +{ + /* + * ... but preserve ahead_start + ahead_size value, + * see 'recheck:' label in page_cache_readahead(). + * Note: We never use ->ahead_size as rvalue without + * checking ->ahead_start != 0 first. 
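The invariant described above is the whole point of reset_ahead_window(): the window is closed, but ahead_start + ahead_size still names the old window end. A minimal stand-alone sketch, with an invented ra_state struct standing in for file_ra_state, shows that the end of the window survives the reset and can later be used to clamp prev_page:

#include <stdio.h>

/* Invented stand-in for the two ahead-window fields discussed above. */
struct ra_state {
        unsigned long ahead_start;
        unsigned long ahead_size;
};

/*
 * Close the ahead window but keep ahead_start + ahead_size intact, so a
 * later clamp of prev_page against the old window end still works.
 */
static void reset_ahead_window(struct ra_state *ra)
{
        ra->ahead_size += ra->ahead_start;
        ra->ahead_start = 0;
}

int main(void)
{
        struct ra_state ra = { .ahead_start = 128, .ahead_size = 64 };
        unsigned long old_end = ra.ahead_start + ra.ahead_size;

        reset_ahead_window(&ra);

        /* The window end survives the reset: both values print 192. */
        printf("end before=%lu end after=%lu\n",
               old_end, ra.ahead_start + ra.ahead_size);
        return 0;
}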
+ */ + ra->ahead_size += ra->ahead_start; + ra->ahead_start = 0; +} + static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; ra->flags = 0; ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); return; } @@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 64) - newsize = newsize * newsize; + if (newsize <= max / 32) + newsize = newsize * 4; else if (newsize <= max / 4) - newsize = max / 4; + newsize = newsize * 2; else newsize = max; return newsize; @@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * congestion. The ahead window will any way be closed * in case we failed due to excessive page cache hits. */ - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); } return ret; @@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * If we get here we are doing sequential IO and this was not the first * occurence (ie we have an existing window) */ - if (ra->ahead_start == 0) { /* no ahead window yet */ if (!make_ahead_window(mapping, filp, ra, 0)) - goto out; + goto recheck; } + /* * Already have an ahead window, check if we crossed into it. * If so, shift windows and issue a new ahead window. @@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); +recheck: + /* prev_page shouldn't overrun the ahead window */ + ra->prev_page = min(ra->prev_page, + ra->ahead_start + ra->ahead_size - 1); } out: diff --git a/mm/rmap.c b/mm/rmap.c index 67f0e20b101..1963e269314 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,13 +56,11 @@ #include <asm/tlbflush.h> -//#define RMAP_DEBUG /* can be enabled only for debugging */ - -kmem_cache_t *anon_vma_cachep; +struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { -#ifdef RMAP_DEBUG +#ifdef CONFIG_DEBUG_VM struct anon_vma *anon_vma = find_vma->anon_vma; struct vm_area_struct *vma; unsigned int mapcount = 0; @@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void anon_vma_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { @@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (page_mapcount(page) < 0) { +#ifdef CONFIG_DEBUG_VM + if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); } - +#endif BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fbaff7..37eaf42ed2c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -875,7 +875,7 @@ redirty: } #ifdef CONFIG_NUMA -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); int err = 1; @@ -2119,7 +2119,7 @@ failed: return err; } -static kmem_cache_t *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +static void init_once(void *foo, struct kmem_cache *cachep, + unsigned long flags) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/slab.c b/mm/slab.c index d0bd7f07ab0..1c8f5ee230d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -50,7 +50,7 @@ * The head array is strictly LIFO and should improve the cache hit rates. * On SMP, it additionally reduces the spinlock operations. * - * The c_cpuarray may not be read with enabled local interrupts - + * The c_cpuarray may not be read with enabled local interrupts - * it's changed with a smp_call_function(). * * SMP synchronization: @@ -170,12 +170,12 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) @@ -266,16 +266,17 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - * [0] is for gcc 2.95. It should really be []. - */ + void *entry[0]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + * [0] is for gcc 2.95. It should really be []. + */ }; -/* bootstrap: The caches do not work without cpuarrays anymore, - * but the cpuarrays are allocated from the generic caches... +/* + * bootstrap: The caches do not work without cpuarrays anymore, but the + * cpuarrays are allocated from the generic caches... 
*/ #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { @@ -291,13 +292,13 @@ struct kmem_list3 { struct list_head slabs_full; struct list_head slabs_free; unsigned long free_objects; - unsigned long next_reap; - int free_touched; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ }; /* @@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_L3 (1 + MAX_NUMNODES) /* - * This function must be completely optimized away if - * a constant is passed to it. Mostly the same as - * what is in linux/slab.h except it returns an - * index. + * This function must be completely optimized away if a constant is passed to + * it. Mostly the same as what is in linux/slab.h except it returns an index. */ static __always_inline int index_of(const size_t size) { @@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) parent->free_touched = 0; } -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ } while (0) -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ @@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; + unsigned int buffer_size; -/* 2) touched by every alloc & free from the backend */ +/* 3) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; -/* 3) cache_grow/shrink */ + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; /* force GFP flags, e.g. 
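The structure above is essentially a small per-CPU LIFO stack of object pointers that is consulted before falling back to the slower shared lists. A rough user-space sketch, with invented toy_* names and malloc()/free() standing in for the real slab backend, might look like this:

#include <stdio.h>
#include <stdlib.h>

#define AC_LIMIT 4

/* Invented stand-in for the per-CPU LIFO array cache. */
struct toy_array_cache {
        unsigned int avail;
        unsigned int limit;
        void *entry[AC_LIMIT];
};

static void *toy_alloc(struct toy_array_cache *ac)
{
        if (ac->avail)                          /* LIFO hit: hand out the hottest object */
                return ac->entry[--ac->avail];
        return malloc(32);                      /* miss: fall back to the slow backend */
}

static void toy_free(struct toy_array_cache *ac, void *obj)
{
        if (ac->avail < ac->limit) {            /* keep it around for the next alloc */
                ac->entry[ac->avail++] = obj;
                return;
        }
        free(obj);                              /* array full: give it back for real */
}

int main(void)
{
        struct toy_array_cache ac = { .avail = 0, .limit = AC_LIMIT };
        void *a = toy_alloc(&ac);

        toy_free(&ac, a);
        void *b = toy_alloc(&ac);               /* served from the array, not malloc() */
        printf("reused=%d\n", a == b);
        free(b);
        return 0;
}

Freeing pushes the object back on the stack, so the very next allocation gets the cache-hot object, which is the point of the LIFO ordering.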
GFP_DMA */ gfp_t gfpflags; - size_t colour; /* cache colouring range */ + size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ struct kmem_cache *slabp_cache; unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + unsigned int dflags; /* dynamic flags */ /* constructor func */ void (*ctor) (void *, struct kmem_cache *, unsigned long); @@ -402,11 +403,11 @@ struct kmem_cache { /* de-constructor func */ void (*dtor) (void *, struct kmem_cache *, unsigned long); -/* 4) cache creation/removal */ +/* 5) cache creation/removal */ const char *name; struct list_head next; -/* 5) statistics */ +/* 6) statistics */ #if STATS unsigned long num_active; unsigned long num_allocations; @@ -438,8 +439,9 @@ struct kmem_cache { #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 -/* Optimization question: fewer reaps means less - * probability for unnessary cpucache drain/refill cycles. +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. @@ -453,17 +455,19 @@ struct kmem_cache { #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) #define STATS_INC_REAPED(x) ((x)->reaped++) -#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_SET_FREEABLE(x, i) \ - do { if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) - +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) @@ -478,9 +482,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) \ - do { } while (0) - +#define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) #define STATS_INC_FREEHIT(x) do { } while (0) @@ -488,7 +490,8 @@ struct kmem_cache { #endif #if DEBUG -/* Magic nums for obj red zoning. +/* + * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ @@ -499,7 +502,8 @@ struct kmem_cache { #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/* memory layout of objects: +/* + * memory layout of objects: * 0 : objp * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that * the end of an object is aligned with the end of the real @@ -508,7 +512,8 @@ struct kmem_cache { * redzone word. * cachep->obj_offset: The real object. 
* cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) { @@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif /* - * Maximum size of an obj (in 2^order pages) - * and absolute limit for the gfp order. + * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp + * order. */ #if defined(CONFIG_LARGE_ALLOCS) #define MAX_OBJ_ORDER 13 /* up to 32Mb */ @@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define BREAK_GFP_ORDER_LO 0 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; -/* Functions for storing/retrieving the cachep and or slab from the - * global 'mem_map'. These are used to find the slab an obj belongs to. - * With kfree(), these are used to find the cache which an obj belongs to. +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. */ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) { @@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct kmem_cache *)page->lru.next; } @@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct slab *)page->lru.prev; } @@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj) return page_get_slab(page); } -/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +static inline unsigned int obj_to_index(struct kmem_cache *cache, + struct slab *slab, void *obj) +{ + return (unsigned)(obj - slab->s_mem) / cache->buffer_size; +} + +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. 
+ */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include <linux/kmalloc_sizes.h> @@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = { .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, .buffer_size = sizeof(struct kmem_cache), - .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* - * vm_enough_memory() looks at this to determine how many - * slab-allocated pages are possibly freeable under pressure + * vm_enough_memory() looks at this to determine how many slab-allocated pages + * are possibly freeable under pressure * * SLAB_RECLAIM_ACCOUNT turns this on per-slab */ @@ -675,7 +697,8 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); static int __node_shrink(struct kmem_cache *cachep, int node); @@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, + gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); } -/* Calculate the number of objects and left-over bytes for a given - buffer size. */ +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) @@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -804,7 +830,7 @@ static void init_reap_node(int cpu) node = next_node(cpu_to_node(cpu), node_online_map); if (node == MAX_NUMNODES) - node = 0; + node = first_node(node_online_map); __get_cpu_var(reap_node) = node; } @@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) if (!ac_ptr) return; - for_each_node(i) kfree(ac_ptr[i]); - kfree(ac_ptr); } @@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) } } -static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) +static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) { int i = 0; struct array_cache *ac; @@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: mutex_lock(&cache_chain_mutex); - /* we need to do this right in the beginning since + /* + * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. 
* kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ list_for_each_entry(cachep, &cache_chain, next) { - /* setup the size64 kmemlist for cpu before we can + /* + * Set up the size64 kmemlist for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ if (!cachep->nodelists[node]) { - if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } - /* Now we can go ahead with allocating the shared array's - & array cache's */ + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!alien) goto bad; cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } #endif spin_unlock_irq(&l3->list_lock); - kfree(shared); free_alien_cache(alien); } @@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1150,7 +1176,7 @@ free_array_cache: #endif } return NOTIFY_OK; - bad: +bad: mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no local_irq_enable(); } -/* Initialisation. - * Called after the gfp() functions have been enabled, and before smp_init(). +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). */ void __init kmem_cache_init(void) { @@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct kmem_cache - * structures of all caches, except cache_cache itself: cache_cache - * is statically allocated. + * 1) initialize the cache_cache cache: it contains the struct + * kmem_cache structures of all caches, except cache_cache itself: + * cache_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. 
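The bootstrap described above boils down to: let the very first cache point its head array at static __init storage so it is usable immediately, then swap in properly allocated storage once the allocator itself works. A very rough user-space sketch of that idea (all names invented, error handling minimal):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRIES 4

/* Toy cache whose head array starts out pointing at static storage. */
struct toy_cache {
        void **array;                           /* current head array */
        int entries;
};

static void *init_array[ENTRIES];               /* static bootstrap array */

int main(void)
{
        struct toy_cache cache = { .array = init_array, .entries = ENTRIES };

        /* The cache is usable while it still points at static storage. */
        cache.array[0] = &cache;

        /* Once allocation works, swap in a properly allocated array. */
        void **dyn = malloc(sizeof(void *) * ENTRIES);
        if (!dyn)
                return 1;
        memcpy(dyn, cache.array, sizeof(void *) * ENTRIES);
        cache.array = dyn;

        printf("bootstrapped=%d\n", cache.array[0] == (void *)&cache);
        free(dyn);
        return 0;
}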
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_line_size()); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void) sizes = malloc_sizes; names = cache_names; - /* Initialize the caches that provide memory for the array cache - * and the kmem_list3 structures first. - * Without this, further allocations will bug + /* + * Initialize the caches that provide memory for the array cache and the + * kmem_list3 structures first. Without this, further allocations will + * bug. */ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); - if (INDEX_AC != INDEX_L3) + if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, - NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } while (sizes->cs_size != ULONG_MAX) { /* @@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ - if (!sizes->cs_cachep) + if (!sizes->cs_cachep) { sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS - | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { @@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void) } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_CACHE_DMA | - SLAB_PANIC), NULL, - NULL); - + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + SLAB_PANIC, + NULL, NULL); sizes++; names++; } @@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void) struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); mutex_unlock(&cache_chain_mutex); } /* Done! */ g_cpucache_up = FULL; - /* Register a cpu startup notifier callback - * that initializes cpu_cache_get for all new cpus + /* + * Register a cpu startup notifier callback that initializes + * cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); - /* The reap timers are started later, with a module init call: - * That part of the kernel is not yet operational. + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. */ } @@ -1366,16 +1397,13 @@ static int __init cpucache_init(void) { int cpu; - /* - * Register the timers that return unneeded - * pages to gfp. 
+ /* + * Register the timers that return unneeded pages to the page allocator */ for_each_online_cpu(cpu) - start_cpu_timer(cpu); - + start_cpu_timer(cpu); return 0; } - __initcall(cpucache_init); /* @@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) atomic_add(i, &slab_reclaim_pages); add_page_state(nr_slab, i); while (i--) { - SetPageSlab(page); + __SetPageSlab(page); page++; } return addr; @@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) const unsigned long nr_freed = i; while (i--) { - if (!TestClearPageSlab(page)) - BUG(); + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); page++; } sub_page_state(nr_slab, nr_freed); @@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) { + for (i = 0; i < limit; i++) printk(" %02x", (unsigned char)data[offset + i]); - } printk("\n"); } #endif @@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) if (cachep->flags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } realobj = (char *)objp + obj_offset(cachep); @@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: start=%p, len=%d\n", + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) * exist: */ struct slab *slabp = virt_to_slab(objp); - int objnr; + unsigned int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #if DEBUG /** - * slab_destroy_objs - call the registered destructor for each object in - * a slab that is to be destroyed. + * slab_destroy_objs - destroy a slab and its objects + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * + * Call the registered destructor for each object in a slab that is being + * destroyed. 
*/ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 - && OFF_SLAB(cachep)) + if (cachep->buffer_size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, - 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); (cachep->dtor) (objp, cachep, 0); } } @@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) #endif /** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { @@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose buffer_size is same - as size of kmem_list3. */ +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, +static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { size_t left_over = 0; int gfporder; - for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, /* * Acceptable internal fragmentation? */ - if ((left_over * 8) <= (PAGE_SIZE << gfporder)) + if (left_over * 8 <= (PAGE_SIZE << gfporder)) break; } return left_over; } +static void setup_cpu_cache(struct kmem_cache *cachep) +{ + if (g_cpucache_up == FULL) { + enable_cpucache(cachep); + return; + } + if (g_cpucache_up == NONE) { + /* + * Note: the first kmem_cache_create must create the cache + * that's used by kmalloc(24), otherwise the creation of + * further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + + /* + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is + * the first cache, then we need to set up all its list3s, + * otherwise the creation of further caches will BUG(). 
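The acceptance test in calculate_slab_order() shown just above, left_over * 8 <= (PAGE_SIZE << gfporder), is easy to check numerically. The following simplified sketch ignores the slab-management overhead and uses an invented toy_slab_order() helper:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * For a given object size, find the smallest order whose leftover space is
 * at most 1/8 of the slab, mirroring the acceptance test above.
 */
static int toy_slab_order(unsigned long obj_size)
{
        for (int order = 0; order <= 5; order++) {
                unsigned long slab_bytes = PAGE_SIZE << order;
                unsigned long num = slab_bytes / obj_size;
                unsigned long left_over = slab_bytes - num * obj_size;

                if (!num)
                        continue;               /* object does not fit yet */
                if (left_over * 8 <= slab_bytes)
                        return order;           /* fragmentation is acceptable */
        }
        return 5;
}

int main(void)
{
        /* A 1100-byte object wastes 796 bytes per page, so order 0 is rejected. */
        printf("order for 1100-byte objects: %d\n", toy_slab_order(1100));
        return 0;
}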
+ */ + set_up_list3s(cachep, SIZE_AC); + if (INDEX_AC == INDEX_L3) + g_cpucache_up = PARTIAL_L3; + else + g_cpucache_up = PARTIAL_AC; + } else { + cachep->array[smp_processor_id()] = + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + if (g_cpucache_up == PARTIAL_AC) { + set_up_list3s(cachep, SIZE_L3); + g_cpucache_up = PARTIAL_L3; + } else { + int node; + for_each_online_node(node) { + cachep->nodelists[node] = + kmalloc_node(sizeof(struct kmem_list3), + GFP_KERNEL, node); + BUG_ON(!cachep->nodelists[node]); + kmem_list3_init(cachep->nodelists[node]); + } + } + } + cachep->nodelists[numa_node_id()]->next_reap = + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * and the @dtor is run before the pages are handed back. * * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting - * unloaded. - * + * the module calling this has to destroy the cache before getting unloaded. + * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check * for buffer overruns. * - * %SLAB_NO_REAP - Don't automatically reap this cache when we're under - * memory pressure. - * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; @@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Sanity checks... these are all serious usage bugs. */ - if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || + if (!name || in_interrupt() || (size < BYTES_PER_WORD) || (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { - printk(KERN_ERR "%s: Early error in slab %s\n", - __FUNCTION__, name); + printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + name); BUG(); } @@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 - || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG_ON(dtor); /* - * Always checks flags, a caller might be expecting debug - * support which isn't available. 
+ * Always checks flags, a caller might be expecting debug support which + * isn't available. */ if (flags & ~CREATE_MASK) BUG(); - /* Check that size is in terms of words. This is needed to avoid + /* + * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ @@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(BYTES_PER_WORD - 1); } - /* calculate out the final buffer alignment: */ + /* calculate the final buffer alignment: */ + /* 1) arch recommendation: can be overridden for debug */ if (flags & SLAB_HWCACHE_ALIGN) { - /* Default alignment: as specified by the arch code. - * Except if an object is really small, then squeeze multiple - * objects into one cacheline. + /* + * Default alignment: as specified by the arch code. Except if + * an object is really small, then squeeze multiple objects into + * one cacheline. */ ralign = cache_line_size(); while (size <= ralign / 2) @@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign > BYTES_PER_WORD) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 4) Store it. Note that the debug code below can reduce + /* + * 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. */ align = ralign; @@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; - spin_lock_init(&cachep->spinlock); cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) @@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->name = name; - if (g_cpucache_up == FULL) { - enable_cpucache(cachep); - } else { - if (g_cpucache_up == NONE) { - /* Note: the first kmem_cache_create must create - * the cache that's used by kmalloc(24), otherwise - * the creation of further caches will BUG(). - */ - cachep->array[smp_processor_id()] = - &initarray_generic.cache; - - /* If the cache that's used by - * kmalloc(sizeof(kmem_list3)) is the first cache, - * then we need to set up all its list3s, otherwise - * the creation of further caches will BUG(). 
- */ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; - else - g_cpucache_up = PARTIAL_AC; - } else { - cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - - if (g_cpucache_up == PARTIAL_AC) { - set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; - } else { - int node; - for_each_online_node(node) { - - cachep->nodelists[node] = - kmalloc_node(sizeof - (struct kmem_list3), - GFP_KERNEL, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep-> - nodelists[node]); - } - } - } - cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - BUG_ON(!cpu_cache_get(cachep)); - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + setup_cpu_cache(cachep); /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); - oops: +oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); @@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -/* - * Waits for all CPUs to execute func(). - */ -static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) -{ - check_irq_on(); - preempt_disable(); - - local_irq_disable(); - func(arg); - local_irq_enable(); - - if (smp_call_function(func, arg, 1, 1)) - BUG(); - - preempt_enable(); -} - -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node); +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); static void do_drain(void *arg) { - struct kmem_cache *cachep = (struct kmem_cache *) arg; + struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_node_id(); @@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - smp_call_function_all_cpus(do_drain, cachep); + on_each_cpu(do_drain, cachep, 1, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock_irq(&l3->list_lock); - drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock_irq(&l3->list_lock); + drain_array(cachep, l3, l3->shared, 1, node); if (l3->alien) drain_alien_cache(cachep, l3->alien); } @@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) /* NUMA: free the list3 structures */ for_each_online_node(i) { - if ((l3 = cachep->nodelists[i])) { + l3 = cachep->nodelists[i]; + if (l3) { kfree(l3->shared); free_alien_cache(l3->alien); kfree(l3); } } kmem_cache_free(&cache_cache, cachep); - unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; - return slabp; } @@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } /* - * Constructors are not allowed to allocate memory from - * the same cache which they are a constructor for. - * Otherwise, deadlock. They must also be threaded. + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) cachep->ctor(objp + obj_offset(cachep), cachep, @@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) - && cachep->flags & SLAB_POISON) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); #else @@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) { - if (!(cachep->gfpflags & GFP_DMA)) - BUG(); - } else { - if (cachep->gfpflags & GFP_DMA) - BUG(); - } + if (flags & SLAB_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + int nodeid) { - void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + void *objp = index_to_obj(cachep, slabp, slabp->free); kmem_bufctl_t next; slabp->inuse++; @@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, - int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + void *objp, int nodeid) { - unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + unsigned int objnr = obj_to_index(cachep, slabp, objp); #if DEBUG /* Verify that the slab belongs to the intended node */ @@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob slabp->inuse--; } -static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, + void *objp) { int i; struct page *page; /* Nasty!!!!!! I hope this is OK. */ - i = 1 << cachep->gfporder; page = virt_to_page(objp); + + i = 1; + if (likely(!PageCompound(page))) + i <<= cachep->gfporder; do { page_set_cache(page, cachep); page_set_slab(page, slabp); @@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) unsigned long ctor_flags; struct kmem_list3 *l3; - /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). 
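slab_get_obj() and slab_put_obj() above keep a slab's free objects as an index chain threaded through the bufctl array, with slabp->free naming the head of the chain. A minimal user-space sketch of that free-list scheme, using invented toy_* names and a four-object slab:

#include <stdio.h>

#define NOBJ 4
#define TOY_BUFCTL_END 0xffffu

/* Toy per-slab free list kept as an index chain, like the bufctl array. */
struct toy_slab {
        unsigned short bufctl[NOBJ];    /* bufctl[i] = index of next free object */
        unsigned int free;              /* index of the first free object */
        unsigned int inuse;
};

static void toy_slab_init(struct toy_slab *s)
{
        for (unsigned int i = 0; i < NOBJ; i++)
                s->bufctl[i] = (unsigned short)(i + 1);
        s->bufctl[NOBJ - 1] = TOY_BUFCTL_END;
        s->free = 0;
        s->inuse = 0;
}

static int toy_get_obj(struct toy_slab *s)
{
        unsigned int idx = s->free;

        if (idx == TOY_BUFCTL_END)
                return -1;              /* slab is full */
        s->free = s->bufctl[idx];       /* unlink the head of the free chain */
        s->inuse++;
        return (int)idx;
}

static void toy_put_obj(struct toy_slab *s, unsigned int idx)
{
        s->bufctl[idx] = (unsigned short)s->free;       /* push back on the chain */
        s->free = idx;
        s->inuse--;
}

int main(void)
{
        struct toy_slab s;

        toy_slab_init(&s);
        int a = toy_get_obj(&s);        /* 0 */
        int b = toy_get_obj(&s);        /* 1 */
        toy_put_obj(&s, (unsigned int)a);
        /* The just-freed index 0 is handed out again before index 2. */
        printf("a=%d b=%d next=%d inuse=%u\n", a, b, toy_get_obj(&s), s.inuse);
        return 0;
}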
*/ if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); @@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ kmem_flagcheck(cachep, flags); - /* Get mem for the objs. - * Attempt to allocate a physical page from 'nodeid', + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. */ - if (!(objp = kmem_getpages(cachep, flags, nodeid))) + objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) goto failed; /* Get slab management. */ - if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + if (!slabp) goto opps1; slabp->nodeid = nodeid; @@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; - opps1: +opps1: kmem_freepages(cachep, objp); - failed: +failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR - "mismatch in kmem_cache_free: expected cache %p, got %p\n", + printk(KERN_ERR "mismatch in kmem_cache_free: expected " + "cache %p, got %p\n", page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); printk(KERN_ERR "%p is %s.\n", page_get_cache(page), @@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE - || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || + *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1:0x%lx, " + "redzone 2:0x%lx.\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* Need to call the slab's constructor so the - * caller can perform a verify of its state (debugging). - * Called without the cache-lock held. + /* + * Need to call the slab's constructor so the caller can + * perform a verify of its state (debugging). Called without + * the cache-lock held. 
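 *
 * For context on the red-zone checks reformatted above (and the matching
 * ones in cache_alloc_debugcheck_after() further down): with SLAB_RED_ZONE
 * the two guard words around an object behave as a small state machine,
 *
 *	object free:      redzone1 == redzone2 == RED_INACTIVE
 *	object allocated: redzone1 == redzone2 == RED_ACTIVE
 *
 * cache_init_objs() stores RED_INACTIVE, allocation expects RED_INACTIVE
 * and flips both words to RED_ACTIVE, and freeing expects RED_ACTIVE (the
 * lines not shown in this hunk put RED_INACTIVE back).  Any other value at
 * free time therefore means either a second free of the same object or a
 * neighbouring buffer overrun clobbering the guard word, which is what the
 * "double free, or memory outside object was overwritten" message reports.
 *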
*/ cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); @@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); @@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { - bad: - printk(KERN_ERR - "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); for (i = 0; i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); i++) { - if ((i % 16) == 0) + if (i % 16 == 0) printk("\n%03x:", i); printk(" %02x", ((unsigned char *)slabp)[i]); } @@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); ac = cpu_cache_get(cachep); - retry: +retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* if there was little recent activity on this - * cache, then perform only a partial refill. - * Otherwise we could generate refill bouncing. + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. */ batchcount = BATCHREFILL_LIMIT; } @@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) list_add(&slabp->list, &l3->slabs_partial); } - must_grow: +must_grow: l3->free_objects -= ac->avail; - alloc_done: +alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { int x; x = cache_grow(cachep, flags, numa_node_id()); - // cache_grow can reenable interrupts, then ac could change. + /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) // no objects in sight? abort + if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) /* objects refilled by interrupt? 
*/ goto retry; } ac->touched = 1; return ac->entry[--ac->avail]; } -static inline void -cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, - void *objp, void *caller) +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) { if (!objp) return objp; @@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE - || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +static __always_inline void *__cache_alloc(struct kmem_cache *cachep, + gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node l3 = cachep->nodelists[nodeid]; BUG_ON(!l3); - retry: +retry: check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; @@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) { + if (slabp->free == BUFCTL_END) list_add(&slabp->list, &l3->slabs_full); - } else { + else list_add(&slabp->list, &l3->slabs_partial); - } spin_unlock(&l3->list_lock); goto done; - must_grow: +must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); @@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node return NULL; goto retry; - done: +done: return obj; } #endif @@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } free_block(cachep, ac->entry, batchcount, node); - free_done: +free_done: #if STATS { int i = 0; @@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif spin_unlock(&l3->list_lock); ac->avail -= batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), - 
sizeof(void *) * ac->avail); + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } /* - * __cache_free - * Release an obj back to its cache. If the obj has a constructed - * state, it must be in this state _before_ it is released. - * - * Called with disabled ints. + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp) { @@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = - cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3; + l3 = cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; - out: +out: return 0; } @@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) local_irq_save(save_flags); if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); @@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node); * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. + * @caller: function caller for debug tracking of the caller * * kmalloc is the normal method of allocating memory * in the kernel. @@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size) /* Catch derefs w/o wrappers */ return (void *)(~(unsigned long)pdata); - unwind_oom: +unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep) struct array_cache *nc = NULL, *new; struct array_cache **new_alien = NULL; #ifdef CONFIG_NUMA - if (!(new_alien = alloc_alien_cache(node, cachep->limit))) + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) goto fail; #endif - if (!(new = alloc_arraycache(node, (cachep->shared * - cachep->batchcount), - 0xbaadf00d))) + new = alloc_arraycache(node, cachep->shared*cachep->batchcount, + 0xbaadf00d); + if (!new) goto fail; - if ((l3 = cachep->nodelists[node])) { - + l3 = cachep->nodelists[node]; + if (l3) { spin_lock_irq(&l3->list_lock); - if ((nc = cachep->nodelists[node]->shared)) + nc = cachep->nodelists[node]->shared; + if (nc) free_block(cachep, nc->entry, nc->avail, node); l3->shared = new; @@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep) new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); kfree(nc); free_alien_cache(new_alien); continue; } - if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + if (!l3) goto fail; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; l3->shared = new; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; 
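		/*
		 * Worked example of the free_limit formula above, with
		 * illustrative numbers only: on a node with two CPUs, a cache
		 * with batchcount = 16 and num = 30 objects per slab gets
		 *
		 *	free_limit = (1 + 2) * 16 + 30 = 78
		 *
		 * i.e. roughly 78 free objects may accumulate on this node's
		 * lists before free_block() starts returning whole empty
		 * slabs to the page allocator.
		 */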
cachep->nodelists[node] = l3; } return err; - fail: +fail: err = -ENOMEM; return err; } @@ -3391,7 +3409,7 @@ struct ccupdate_struct { static void do_ccupdate_local(void *info) { - struct ccupdate_struct *new = (struct ccupdate_struct *)info; + struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); @@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, - int shared) +/* Always called with the cache_chain_mutex held */ +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) { struct ccupdate_struct new; int i, err; memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new.new[i] = - alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, + batchcount); if (!new.new[i]) { for (i--; i >= 0; i--) kfree(new.new[i]); @@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount } new.cachep = cachep; - smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); check_irq_on(); - spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount return 0; } +/* Called with cache_chain_mutex held always */ static void enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; - /* The head array serves three purposes: + /* + * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and + * - reduce the number of linked list operations on the slab and * bufctl chains: array operations are cheaper. * The numbers are guessed, we should auto-tune as described by * Bonwick. @@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep) else limit = 120; - /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound * allocation behaviour: Most allocs on one cpu, most free operations * on another cpu. For these cases, an efficient object passing between * cpus is necessary. This is provided by a shared array. The array @@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep) #endif #if DEBUG - /* With debugging enabled, large batchcount lead to excessively - * long periods with disabled local interrupts. Limit the - * batchcount + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount */ if (limit > 32) limit = 32; @@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep) cachep->name, -err); } -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node) +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. 
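 *
 * The amount drained per call when "force" is not set (see the body just
 * below) is about a fifth of the array's limit, but never more than half
 * of what is currently cached.  For example, with the limit of 120 used
 * for the smallest objects:
 *
 *	tofree = (120 + 4) / 5 = 24	if at least 24 objects are cached
 *	tofree = (10 + 1) / 2  = 5	if only 10 objects are cached
 *
 * so an idle array is trimmed gradually over several reap intervals rather
 * than emptied in one go.
 *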
+ */ +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { int tofree; - check_spinlock_acquired_node(cachep, node); + if (!ac || !ac->avail) + return; if (ac->touched && !force) { ac->touched = 0; - } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) { - tofree = (ac->avail + 1) / 2; + } else { + spin_lock_irq(&l3->list_lock); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); } - free_block(cachep, ac->entry, tofree, node); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); + spin_unlock_irq(&l3->list_lock); } } @@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain mutex then just give up - we'll - * try again on the next iteration. + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. */ static void cache_reap(void *unused) { struct list_head *walk; struct kmem_list3 *l3; + int node = numa_node_id(); if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ @@ -3550,65 +3580,72 @@ static void cache_reap(void *unused) struct slab *slabp; searchp = list_entry(walk, struct kmem_cache, next); - - if (searchp->flags & SLAB_NO_REAP) - goto next; - check_irq_on(); - l3 = searchp->nodelists[numa_node_id()]; + /* + * We only take the l3 lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + l3 = searchp->nodelists[node]; + reap_alien(searchp, l3); - spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, cpu_cache_get(searchp), 0, - numa_node_id()); + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ if (time_after(l3->next_reap, jiffies)) - goto next_unlock; + goto next; l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - if (l3->shared) - drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) { l3->free_touched = 0; - goto next_unlock; + goto next; } - tofree = - (l3->free_limit + 5 * searchp->num - - 1) / (5 * searchp->num); + tofree = (l3->free_limit + 5 * searchp->num - 1) / + (5 * searchp->num); do { + /* + * Do not lock if there are no free blocks. + */ + if (list_empty(&l3->slabs_free)) + break; + + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) + if (p == &(l3->slabs_free)) { + spin_unlock_irq(&l3->list_lock); break; + } slabp = list_entry(p, struct slab, list); BUG_ON(slabp->inuse); list_del(&slabp->list); STATS_INC_REAPED(searchp); - /* Safe to drop the lock. The slab is no longer - * linked to the cache. - * searchp cannot disappear, we hold + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. 
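 *
 * (On the pacing of this loop: tofree above is
 *	(l3->free_limit + 5 * searchp->num - 1) / (5 * searchp->num),
 * the ceiling of free_limit / (5 * num).  With, say, free_limit = 78 and
 * num = 30 objects per slab that is one slab per pass, so even a
 * completely idle cache is shrunk back towards free_limit a little on
 * each REAPTIMEOUT_CPUC tick rather than all at once.)
 *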
searchp cannot disappear, we hold * cache_chain_lock */ l3->free_objects -= searchp->num; spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); - spin_lock_irq(&l3->list_lock); } while (--tofree > 0); - next_unlock: - spin_unlock_irq(&l3->list_lock); - next: +next: cond_resched(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - /* Setup the next iteration */ + /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } @@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) { struct kmem_cache *cachep = p; ++*pos; - return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, struct kmem_cache, next); + return cachep->next.next == &cache_chain ? + NULL : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees); } /* cpu stats */ { @@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock(&cachep->spinlock); return 0; } @@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, - next); + struct kmem_cache *cachep; + cachep = list_entry(p, struct kmem_cache, next); if (!strcmp(cachep->name, kbuf)) { - if (limit < 1 || - batchcount < 1 || - batchcount > limit || shared < 0) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, diff --git a/mm/swap.c b/mm/swap.c index b524ea90bdd..91b7e2026f6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,19 +209,18 @@ int lru_add_drain_all(void) */ void fastcall __page_cache_release(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); - spin_lock_irqsave(&zone->lru_lock, flags); - if (TestClearPageLRU(page)) + spin_lock_irqsave(&zone->lru_lock, flags); + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) != 0) - page = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - if (page) - free_hot_page(page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); } - EXPORT_SYMBOL(__page_cache_release); /* @@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; - struct zone *pagezone; if (unlikely(PageCompound(page))) { if (zone) { @@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) if (!put_page_testzero(page)) continue; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if 
(TestClearPageLRU(page)) + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) == 0) { - if (!pagevec_add(&pages_to_free, page)) { + } + + if (!pagevec_add(&pages_to_free, page)) { + if (zone) { spin_unlock_irq(&zone->lru_lock); - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - zone = NULL; /* No lock is held */ + zone = NULL; } - } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + } } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); - if (TestSetPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(PageActive(page)); + SetPageActive(page); add_page_to_active_list(zone, page); } if (zone) diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e163..d7af296833f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/migrate.h> #include <asm/pgtable.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d073b..365ed6ff182 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER-1; + si->cluster_next = offset-SWAPFILE_CLUSTER+1; goto cluster; } if (unlikely(--latency_ration < 0)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3aa02e..fd572bbdc9f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,39 +33,21 @@ #include <linux/cpuset.h> #include <linux/notifier.h> #include <linux/rwsem.h> +#include <linux/delay.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; +#include "internal.h" struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - unsigned long nr_mapped; /* From page_state */ - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - /* This context's GFP mask */ gfp_t gfp_mask; @@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. 
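 *
 * (A note on the one-character mm/swapfile.c fix a few hunks up: in
 * scan_swap_map(), hitting a used slot at position u sets
 * last_in_cluster = u + SWAPFILE_CLUSTER, and reaching
 * offset == last_in_cluster with nothing but free slots in between means
 * the free run is u+1 .. offset, exactly SWAPFILE_CLUSTER slots long, so
 * its first slot is offset - SWAPFILE_CLUSTER + 1.  With the usual
 * SWAPFILE_CLUSTER of 256 and a used slot at 1000, the run 1001..1256 is
 * detected at offset 1256 and cluster_next should become 1001; the old
 * "- 1" expression yielded 999, pointing below the start of the cluster.)
 *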
*/ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -414,14 +398,15 @@ cannot_free: } /* - * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + * shrink_page_list() returns the number of reclaimed pages */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long nr_reclaimed = 0; cond_resched(); @@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page)) if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; - } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - /* - * No unmapping if we do not swap - */ - if (!sc->may_swap) - goto keep_locked; - switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; @@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) free_it: unlock_page(page); - reclaimed++; + nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -579,483 +555,8 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); - sc->nr_reclaimed += reclaimed; - return reclaimed; -} - -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. 
- */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> - * Hirokazu Takahashi <taka@valinux.co.jp> - * Dave Hansen <haveblue@us.ibm.com> - * Christoph Lameter <clameter@sgi.com> - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. 
- */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; + return nr_reclaimed; } -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. 
- */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - int retry; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. 
Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { - ret = 1; - get_page(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif /* * zone->lru_lock is heavily contended. Some of the functions that @@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. */ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan; - while (scan++ < nr_to_scan && !list_empty(src)) { + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); - if (get_page_testone(page)) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + ClearPageLRU(page); + target = dst; nr_taken++; - } + } /* else it is being freed elsewhere */ + + list_add(&page->lru, target); } *scanned = scan; @@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, } /* - * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number + * of reclaimed pages */ -static void shrink_cache(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, &zone->inactive_list, @@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - - max_scan -= nr_scan; - nr_freed = shrink_list(&page_list, sc); - + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc); + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); @@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) __mod_page_state_zone(zone, pgscan_direct, nr_scan); __mod_page_state_zone(zone, pgsteal, nr_freed); + if (nr_taken == 0) + goto done; + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - } - spin_unlock_irq(&zone->lru_lock); + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); done: + local_irq_enable(); pagevec_release(&pvec); + return nr_reclaimed; } /* @@ -1188,13 +697,12 @@ done: * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. 
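 *
 * (This refcount-before-flag ordering is what the isolate_lru_pages()
 * rewrite above enforces: the page is first pinned with
 * get_page_unless_zero() -- presumably an atomic_inc_not_zero() on
 * page->_count -- and only if that succeeds is PageLRU cleared.  The old
 * TestClearPageLRU()-then-get_page_testone() order could briefly clear the
 * LRU bit on a page whose last reference was being dropped concurrently,
 * something the reworked __page_cache_release()/release_pages() paths in
 * mm/swap.c now assume can no longer happen.)
 *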
*/ -static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; - int nr_pages = sc->nr_to_scan; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct pagevec pvec; int reclaim_mapped = 0; - if (unlikely(sc->may_swap)) { + if (sc->may_swap) { long mapped_ratio; long distress; long swap_tendency; @@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; @@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; atomic_inc(&zone->reclaim_in_progress); @@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. 
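 *
 * As a feel for the pacing below: at the initial DEF_PRIORITY of 12, a
 * zone with about 2,000,000 active pages earns (2000000 >> 12) + 1 = 489
 * pages of scan credit per call, and scanning only actually happens once
 * the accumulated credit reaches sc->swap_cluster_max (normally
 * SWAP_CLUSTER_MAX, i.e. 32 pages), in chunks of at most that size.  As
 * the priority falls towards zero the shift shrinks, so each pass covers
 * a progressively larger share of the zone.
 *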
*/ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + zone->nr_scan_active += (zone->nr_active >> priority) + 1; nr_active = zone->nr_scan_active; if (nr_active >= sc->swap_cluster_max) zone->nr_scan_active = 0; else nr_active = 0; - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; nr_inactive = zone->nr_scan_inactive; if (nr_inactive >= sc->swap_cluster_max) zone->nr_scan_inactive = 0; @@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) while (nr_active || nr_inactive) { if (nr_active) { - sc->nr_to_scan = min(nr_active, + nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); + nr_active -= nr_to_scan; + shrink_active_list(nr_to_scan, zone, sc); } if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, + nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); + nr_inactive -= nr_to_scan; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); } } throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; } /* @@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void -shrink_caches(struct zone **zones, struct scan_control *sc) +static unsigned long shrink_zones(int priority, struct zone **zones, + struct scan_control *sc) { + unsigned long nr_reclaimed = 0; int i; for (i = 0; zones[i] != NULL; i++) { @@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(zone, sc); + nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } /* @@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; unsigned long lru_pages = 0; int i; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; inc_page_state(allocstall); @@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); - shrink_caches(zones, &sc); + nr_reclaimed += shrink_zones(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; + nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } total_scanned += sc.nr_scanned; - total_reclaimed += sc.nr_reclaimed; - if (total_reclaimed >= sc.swap_cluster_max) { + if (nr_reclaimed >= sc.swap_cluster_max) { ret = 1; goto out; } @@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); sc.may_writepage = 1; } @@ -1528,22 +1042,26 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; loop_again: total_scanned = 0; - total_reclaimed = 0; - sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + nr_reclaimed = 0; + sc.may_writepage = !laptop_mode, sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1624,15 +1142,11 @@ scan: if (zone->prev_priority > priority) zone->prev_priority = priority; sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; + nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; @@ -1645,10 +1159,10 @@ scan: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) + total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > total_reclaimed) + if (nr_pages && to_free > nr_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ @@ -1665,7 +1179,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) break; } out: @@ -1679,7 +1193,7 @@ out: goto loop_again; } - return total_reclaimed; + return nr_reclaimed; } /* @@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. */ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; + unsigned retry = 2; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; +repeat: for_each_pgdat(pgdat) { - int freed; + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } current->reclaim_state = NULL; return ret; } @@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages) away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; @@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + pgdat->kswapd = find_task_by_pid(pid); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; /* * Try to free up some pages from this zone through reclaim. 
*/ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc; - cpumask_t mask; - int node_id; - - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) - return 0; - - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (p->flags & PF_MEMALLOC)) - return 0; - - node_id = zone->zone_pgdat->node_id; - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) - return 0; - - sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); - sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = ZONE_RECLAIM_PRIORITY + 1; - sc.nr_mapped = read_page_state(nr_mapped); - sc.gfp_mask = gfp_mask; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; disable_swap_token(); - - nr_pages = 1 << order; - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing priorities * until we have enough memory freed. */ + priority = ZONE_RECLAIM_PRIORITY; do { - sc.priority--; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); - } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); - - if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* - * shrink_slab does not currently allow us to determine - * how many pages were freed in the zone. So we just - * shake the slab and then go offnode for a single allocation. + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. * * shrink_slab will free memory on all zones and may take * a long time. @@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (sc.nr_reclaimed == 0) + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. 
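 *
 * Concretely, zone_reclaim_interval defaults to 30*HZ (see above), so a
 * failed attempt stamps last_unsuccessful_zone_reclaim with the current
 * jiffies and the time_before() test in zone_reclaim() then short-circuits
 * local reclaim for roughly the next 30 seconds, letting allocations spill
 * to other nodes instead of rescanning a zone that has just proven
 * unreclaimable on every off-node allocation.
 *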
+ */ zone->last_unsuccessful_zone_reclaim = jiffies; + } - return sc.nr_reclaimed >= nr_pages; + return nr_reclaimed >= nr_pages; } -#endif +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + cpumask_t mask; + int node_id; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. + */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0 || + (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + return __zone_reclaim(zone, gfp_mask, order); +} +#endif diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 74cb79eb917..f6940618e34 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -16,11 +16,12 @@ #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include "internal.h" /* session keyring create vs join semaphore */ -static DECLARE_MUTEX(key_session_sem); +static DEFINE_MUTEX(key_session_mutex); /* the root user's tracking struct */ struct key_user root_key_user = { @@ -711,7 +712,7 @@ long join_session_keyring(const char *name) } /* allow the user to join or create a named keyring */ - down(&key_session_sem); + mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, 0); @@ -737,7 +738,7 @@ long join_session_keyring(const char *name) key_put(keyring); error2: - up(&key_session_sem); + mutex_unlock(&key_session_mutex); error: return ret; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5b16196f282..ccaf988f372 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -117,6 +117,8 @@ static struct security_operations *secondary_ops = NULL; static LIST_HEAD(superblock_security_head); static DEFINE_SPINLOCK(sb_security_lock); +static kmem_cache_t *sel_inode_cache; + /* Allocate and free functions for each kind of security blob. 
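   Inode blobs are now carved from the dedicated sel_inode_cache slab
   created in selinux_init().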
*/ static int task_alloc_security(struct task_struct *task) @@ -146,10 +148,11 @@ static int inode_alloc_security(struct inode *inode) struct task_security_struct *tsec = current->security; struct inode_security_struct *isec; - isec = kzalloc(sizeof(struct inode_security_struct), GFP_KERNEL); + isec = kmem_cache_alloc(sel_inode_cache, SLAB_KERNEL); if (!isec) return -ENOMEM; + memset(isec, 0, sizeof(*isec)); init_MUTEX(&isec->sem); INIT_LIST_HEAD(&isec->list); isec->inode = inode; @@ -172,7 +175,7 @@ static void inode_free_security(struct inode *inode) spin_unlock(&sbsec->isec_lock); inode->i_security = NULL; - kfree(isec); + kmem_cache_free(sel_inode_cache, isec); } static int file_alloc_security(struct file *file) @@ -1929,7 +1932,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, struct task_security_struct *tsec; struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; - struct inode_security_struct *isec; u32 newsid, clen; int rc; char *namep = NULL, *context; @@ -1937,7 +1939,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, tsec = current->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; - isec = inode->i_security; if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) { newsid = tsec->create_sid; @@ -1957,7 +1958,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, inode_security_set_sid(inode, newsid); - if (sbsec->behavior == SECURITY_FS_USE_MNTPOINT) + if (!ss_initialized || sbsec->behavior == SECURITY_FS_USE_MNTPOINT) return -EOPNOTSUPP; if (name) { @@ -4408,6 +4409,9 @@ static __init int selinux_init(void) tsec = current->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; + sel_inode_cache = kmem_cache_create("selinux_inode_security", + sizeof(struct inode_security_struct), + 0, SLAB_PANIC, NULL, NULL); avc_init(); original_ops = secondary_ops = security_ops; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index b5fa02d17b1..f5d78365488 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <linux/mutex.h> #include <linux/init.h> #include <linux/string.h> #include <linux/security.h> @@ -44,7 +45,7 @@ static int __init checkreqprot_setup(char *str) __setup("checkreqprot=", checkreqprot_setup); -static DECLARE_MUTEX(sel_sem); +static DEFINE_MUTEX(sel_mutex); /* global data for booleans */ static struct dentry *bool_dir = NULL; @@ -230,7 +231,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, ssize_t length; void *data = NULL; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__LOAD_POLICY); if (length) @@ -262,7 +263,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, else length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); vfree(data); return length; } @@ -709,12 +710,11 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, { char *page = NULL; ssize_t length; - ssize_t end; ssize_t ret; int cur_enforcing; struct inode *inode; - down(&sel_sem); + mutex_lock(&sel_mutex); ret = -EFAULT; @@ -740,26 +740,9 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); - if (length < 0) { - ret = length; - goto out; - } - - if (*ppos >= length) { - 
ret = 0; - goto out; - } - if (count + *ppos > length) - count = length - *ppos; - end = count + *ppos; - if (copy_to_user(buf, (char *) page + *ppos, count)) { - ret = -EFAULT; - goto out; - } - *ppos = end; - ret = count; + ret = simple_read_from_buffer(buf, count, ppos, page, length); out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long)page); return ret; @@ -773,7 +756,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, int new_value; struct inode *inode; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__SETBOOL); if (length) @@ -812,7 +795,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long) page); return length; @@ -831,7 +814,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, ssize_t length = -EFAULT; int new_value; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__SETBOOL); if (length) @@ -869,7 +852,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long) page); return length; @@ -987,7 +970,7 @@ out: return ret; err: kfree(values); - d_genocide(dir); + sel_remove_bools(dir); ret = -ENOMEM; goto out; } @@ -1168,37 +1151,38 @@ static int sel_make_avc_files(struct dentry *dir) dentry = d_alloc_name(dir, files[i].name); if (!dentry) { ret = -ENOMEM; - goto err; + goto out; } inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode); if (!inode) { ret = -ENOMEM; - goto err; + goto out; } inode->i_fop = files[i].ops; d_add(dentry, inode); } out: return ret; -err: - d_genocide(dir); - goto out; } -static int sel_make_dir(struct super_block *sb, struct dentry *dentry) +static int sel_make_dir(struct inode *dir, struct dentry *dentry) { int ret = 0; struct inode *inode; - inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + inode = sel_make_inode(dir->i_sb, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) { ret = -ENOMEM; goto out; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inode->i_nlink++; d_add(dentry, inode); + /* bump link count on parent directory, too */ + dir->i_nlink++; out: return ret; } @@ -1207,7 +1191,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) { int ret; struct dentry *dentry; - struct inode *inode; + struct inode *inode, *root_inode; struct inode_security_struct *isec; static struct tree_descr selinux_files[] = { @@ -1228,30 +1212,33 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) }; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); if (ret) - return ret; + goto err; + + root_inode = sb->s_root->d_inode; dentry = d_alloc_name(sb->s_root, BOOL_DIR_NAME); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } - inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); - if (!inode) - goto out; - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - d_add(dentry, inode); - bool_dir = dentry; - ret = sel_make_bools(); + ret = sel_make_dir(root_inode, dentry); if (ret) - goto out; + goto err; + + bool_dir = dentry; dentry = d_alloc_name(sb->s_root, NULL_FILE_NAME); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } inode = sel_make_inode(sb, S_IFCHR | S_IRUGO | S_IWUGO); - if (!inode) - goto out; + if (!inode) { + ret = -ENOMEM; + goto err; + } isec = (struct inode_security_struct*)inode->i_security; isec->sid = SECINITSID_DEVNULL; isec->sclass = SECCLASS_CHR_FILE; @@ -1262,22 +1249,23 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) selinux_null = dentry; dentry = d_alloc_name(sb->s_root, "avc"); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } - ret = sel_make_dir(sb, dentry); + ret = sel_make_dir(root_inode, dentry); if (ret) - goto out; + goto err; ret = sel_make_avc_files(dentry); if (ret) - goto out; - - return 0; + goto err; out: - dput(dentry); + return ret; +err: printk(KERN_ERR "%s: failed while creating inodes\n", __FUNCTION__); - return -ENOMEM; + goto out; } static struct super_block *sel_get_sb(struct file_system_type *fs_type, diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 8a764928ff4..63e0b7f29cb 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -27,7 +27,8 @@ #include <linux/in.h> #include <linux/sched.h> #include <linux/audit.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> + #include "flask.h" #include "avc.h" #include "avc_ss.h" @@ -48,9 +49,9 @@ static DEFINE_RWLOCK(policy_rwlock); #define POLICY_RDUNLOCK read_unlock(&policy_rwlock) #define POLICY_WRUNLOCK write_unlock_irq(&policy_rwlock) -static DECLARE_MUTEX(load_sem); -#define LOAD_LOCK down(&load_sem) -#define LOAD_UNLOCK up(&load_sem) +static DEFINE_MUTEX(load_mutex); +#define LOAD_LOCK mutex_lock(&load_mutex) +#define LOAD_UNLOCK mutex_unlock(&load_mutex) static struct sidtab sidtab; struct policydb policydb; |
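
A pattern that recurs throughout this part of the patch (process_keys.c, selinuxfs.c and services.c above) is the conversion of semaphores used as plain locks to the mutex API. A minimal sketch of the before/after shape, using hypothetical names (example_mutex, example_op) rather than anything from the patch:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_mutex);	/* was: static DECLARE_MUTEX(example_sem); */

	static int example_op(void)
	{
		mutex_lock(&example_mutex);	/* was: down(&example_sem); */
		/* ... work that must be serialized ... */
		mutex_unlock(&example_mutex);	/* was: up(&example_sem); */
		return 0;
	}

Mutexes are meant for exactly this single-holder use and carry stricter debugging checks than counting semaphores, which is why these one-at-a-time critical sections were converted.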
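
sel_read_bool() above drops its open-coded *ppos bookkeeping and copy_to_user() call in favour of simple_read_from_buffer(). The helper's general use in a read handler looks roughly like this (example_read and the "42" payload are made up for illustration):

	#include <linux/fs.h>
	#include <linux/kernel.h>

	static ssize_t example_read(struct file *file, char __user *buf,
				    size_t count, loff_t *ppos)
	{
		char kbuf[32];
		ssize_t len;

		len = scnprintf(kbuf, sizeof(kbuf), "%d\n", 42);
		/* handles *ppos bounds checking, short reads and copy_to_user() */
		return simple_read_from_buffer(buf, count, ppos, kbuf, len);
	}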
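
The hooks.c change converts inode security blobs from kzalloc()/kfree() to a dedicated slab cache, created once with SLAB_PANIC and used for every allocation thereafter. The general shape, with hypothetical names (example_cache, struct example_blob):

	#include <linux/init.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	struct example_blob {
		int data;
	};

	static kmem_cache_t *example_cache;

	static int __init example_init(void)
	{
		/* SLAB_PANIC: a failure here panics at boot, so no error path */
		example_cache = kmem_cache_create("example_blob_cache",
						  sizeof(struct example_blob),
						  0, SLAB_PANIC, NULL, NULL);
		return 0;
	}

	static struct example_blob *example_alloc(void)
	{
		struct example_blob *p = kmem_cache_alloc(example_cache, SLAB_KERNEL);

		if (p)
			memset(p, 0, sizeof(*p));	/* kmem_cache_alloc() does not zero */
		return p;
	}

	static void example_free(struct example_blob *p)
	{
		kmem_cache_free(example_cache, p);
	}

A per-type cache avoids kmalloc's size-bucket rounding and shows up with its own statistics in /proc/slabinfo, which matters for an object allocated for every inode.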