175 files changed, 6839 insertions, 5527 deletions
diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt index 4ef9f7cd5dc..944aa55e79f 100644 --- a/Documentation/networking/e100.txt +++ b/Documentation/networking/e100.txt @@ -1,16 +1,17 @@ Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== -November 17, 2004 - +November 15, 2005 Contents ======== - In This Release - Identifying Your Adapter +- Building and Installation - Driver Configuration Parameters - Additional Configurations +- Known Issues - Support @@ -18,18 +19,30 @@ In This Release =============== This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of -Adapters, version 3.3.x. This driver supports 2.4.x and 2.6.x kernels. +Adapters. This driver includes support for Itanium(R)2-based systems. + +For questions related to hardware requirements, refer to the documentation +supplied with your Intel PRO/100 adapter. + +The following features are now available in supported kernels: + - Native VLANs + - Channel Bonding (teaming) + - SNMP + +Channel Bonding documentation can be found in the Linux kernel source: +/Documentation/networking/bonding.txt + Identifying Your Adapter ======================== -For more information on how to identify your adapter, go to the Adapter & +For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: http://support.intel.com/support/network/adapter/pro100/21397.htm -For the latest Intel network drivers for Linux, refer to the following -website. In the search field, enter your adapter name or type, or use the +For the latest Intel network drivers for Linux, refer to the following +website. In the search field, enter your adapter name or type, or use the networking link on the left to search for your adapter: http://downloadfinder.intel.com/scripts-df/support_intel.asp @@ -40,73 +53,75 @@ Driver Configuration Parameters The default value for each parameter is generally the recommended setting, unless otherwise noted. -Rx Descriptors: Number of receive descriptors. A receive descriptor is a data - structure that describes a receive buffer and its attributes to the network - controller. The data in the descriptor is used by the controller to write - data from the controller to host memory. In the 3.0.x driver the valid - range for this parameter is 64-256. The default value is 64. This parameter - can be changed using the command - +Rx Descriptors: Number of receive descriptors. A receive descriptor is a data + structure that describes a receive buffer and its attributes to the network + controller. The data in the descriptor is used by the controller to write + data from the controller to host memory. In the 3.x.x driver the valid range + for this parameter is 64-256. The default value is 64. This parameter can be + changed using the command: + ethtool -G eth? rx n, where n is the number of desired rx descriptors. -Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a - data structure that describes a transmit buffer and its attributes to the - network controller. The data in the descriptor is used by the controller to - read data from the host memory to the controller. In the 3.0.x driver the - valid range for this parameter is 64-256. The default value is 64. This - parameter can be changed using the command +Tx Descriptors: Number of transmit descriptors. 
A transmit descriptor is a data + structure that describes a transmit buffer and its attributes to the network + controller. The data in the descriptor is used by the controller to read + data from the host memory to the controller. In the 3.x.x driver the valid + range for this parameter is 64-256. The default value is 64. This parameter + can be changed using the command: ethtool -G eth? tx n, where n is the number of desired tx descriptors. -Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by - default. Ethtool can be used as follows to force speed/duplex. +Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by + default. Ethtool can be used as follows to force speed/duplex. ethtool -s eth? autoneg off speed {10|100} duplex {full|half} NOTE: setting the speed/duplex to incorrect values will cause the link to fail. -Event Log Message Level: The driver uses the message level flag to log events - to syslog. The message level can be set at driver load time. It can also be - set using the command +Event Log Message Level: The driver uses the message level flag to log events + to syslog. The message level can be set at driver load time. It can also be + set using the command: ethtool -s eth? msglvl n + Additional Configurations ========================= Configuring the Driver on Different Distributions ------------------------------------------------- - Configuring a network driver to load properly when the system is started is - distribution dependent. Typically, the configuration process involves adding - an alias line to /etc/modules.conf as well as editing other system startup - scripts and/or configuration files. Many popular Linux distributions ship - with tools to make these changes for you. To learn the proper way to - configure a network device for your system, refer to your distribution - documentation. If during this process you are asked for the driver or module - name, the name for the Linux Base Driver for the Intel PRO/100 Family of - Adapters is e100. + Configuring a network driver to load properly when the system is started is + distribution dependent. Typically, the configuration process involves adding + an alias line to /etc/modules.conf or /etc/modprobe.conf as well as editing + other system startup scripts and/or configuration files. Many popular Linux + distributions ship with tools to make these changes for you. To learn the + proper way to configure a network device for your system, refer to your + distribution documentation. If during this process you are asked for the + driver or module name, the name for the Linux Base Driver for the Intel + PRO/100 Family of Adapters is e100. - As an example, if you install the e100 driver for two PRO/100 adapters - (eth0 and eth1), add the following to modules.conf: + As an example, if you install the e100 driver for two PRO/100 adapters + (eth0 and eth1), add the following to modules.conf or modprobe.conf: alias eth0 e100 alias eth1 e100 Viewing Link Messages --------------------- - In order to see link messages and other Intel driver information on your - console, you must set the dmesg level up to six. This can be done by - entering the following on the command line before loading the e100 driver: + In order to see link messages and other Intel driver information on your + console, you must set the dmesg level up to six. 
This can be done by + entering the following on the command line before loading the e100 driver: dmesg -n 8 - If you wish to see all messages issued by the driver, including debug + If you wish to see all messages issued by the driver, including debug messages, set the dmesg level to eight. NOTE: This setting is not saved across reboots. + Ethtool ------- @@ -114,29 +129,27 @@ Additional Configurations diagnostics, as well as displaying statistical information. Ethtool version 1.6 or later is required for this functionality. - The latest release of ethtool can be found at: - http://sf.net/projects/gkernel. + The latest release of ethtool can be found from + http://sourceforge.net/projects/gkernel. - NOTE: This driver uses mii support from the kernel. As a result, when - there is no link, ethtool will report speed/duplex to be 10/half. + NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support + for a more complete ethtool feature set can be enabled by upgrading + ethtool to ethtool-1.8.1. - NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support - for a more complete ethtool feature set can be enabled by upgrading - ethtool to ethtool-1.8.1. Enabling Wake on LAN* (WoL) --------------------------- - WoL is provided through the Ethtool* utility. Ethtool is included with Red - Hat* 8.0. For other Linux distributions, download and install Ethtool from - the following website: http://sourceforge.net/projects/gkernel. + WoL is provided through the Ethtool* utility. Ethtool is included with Red + Hat* 8.0. For other Linux distributions, download and install Ethtool from + the following website: http://sourceforge.net/projects/gkernel. - For instructions on enabling WoL with Ethtool, refer to the Ethtool man - page. + For instructions on enabling WoL with Ethtool, refer to the Ethtool man page. WoL will be enabled on the system during the next shut down or reboot. For - this driver version, in order to enable WoL, the e100 driver must be + this driver version, in order to enable WoL, the e100 driver must be loaded when shutting down or rebooting the system. + NAPI ---- @@ -144,6 +157,25 @@ Additional Configurations See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. + Multiple Interfaces on Same Ethernet Broadcast Network + ------------------------------------------------------ + + Due to the default ARP behavior on Linux, it is not possible to have + one system on two IP networks in the same Ethernet broadcast domain + (non-partitioned switch) behave as expected. All Ethernet interfaces + will respond to IP traffic for any IP address assigned to the system. + This results in unbalanced receive traffic. + + If you have multiple interfaces in a server, either turn on ARP + filtering by + + (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter + (this only works if your kernel's version is higher than 2.4.5), or + + (2) installing the interfaces in separate broadcast domains (either + in different switches or in a switch partitioned to VLANs). + + Support ======= @@ -151,20 +183,24 @@ For general information, go to the Intel support website at: http://support.intel.com + or the Intel Wired Networking project hosted by Sourceforge at: + + http://sourceforge.net/projects/e1000 + If an issue is identified with the released source code on the supported -kernel with a supported adapter, email the specific information related to -the issue to linux.nics@intel.com. 
+kernel with a supported adapter, email the specific information related to the +issue to e1000-devel@lists.sourceforge.net. License ======= -This software program is released under the terms of a license agreement -between you ('Licensee') and Intel. Do not use or load this software or any -associated materials (collectively, the 'Software') until you have carefully -read the full terms and conditions of the LICENSE located in this software -package. By loading or using the Software, you agree to the terms of this -Agreement. If you do not agree with the terms of this Agreement, do not -install or use the Software. +This software program is released under the terms of a license agreement +between you ('Licensee') and Intel. Do not use or load this software or any +associated materials (collectively, the 'Software') until you have carefully +read the full terms and conditions of the file COPYING located in this software +package. By loading or using the Software, you agree to the terms of this +Agreement. If you do not agree with the terms of this Agreement, do not install +or use the Software. * Other names and brands may be claimed as the property of others. diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt index 2ebd4058d46..71fe15af356 100644 --- a/Documentation/networking/e1000.txt +++ b/Documentation/networking/e1000.txt @@ -1,7 +1,7 @@ Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters =============================================================== -November 17, 2004 +November 15, 2005 Contents @@ -20,254 +20,316 @@ In This Release =============== This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family -of Adapters, version 5.x.x. +of Adapters. This driver includes support for Itanium(R)2-based systems. -For questions related to hardware requirements, refer to the documentation -supplied with your Intel PRO/1000 adapter. All hardware requirements listed +For questions related to hardware requirements, refer to the documentation +supplied with your Intel PRO/1000 adapter. All hardware requirements listed apply to use with Linux. -Native VLANs are now available with supported kernels. +The following features are now available in supported kernels: + - Native VLANs + - Channel Bonding (teaming) + - SNMP + +Channel Bonding documentation can be found in the Linux kernel source: +/Documentation/networking/bonding.txt + +The driver information previously displayed in the /proc filesystem is not +supported in this release. Alternatively, you can use ethtool (version 1.6 +or later), lspci, and ifconfig to obtain the same information. + +Instructions on updating ethtool can be found in the section "Additional +Configurations" later in this document. + Identifying Your Adapter ======================== -For more information on how to identify your adapter, go to the Adapter & +For more information on how to identify your adapter, go to the Adapter & Driver ID Guide at: http://support.intel.com/support/network/adapter/pro100/21397.htm -For the latest Intel network drivers for Linux, refer to the following -website. In the search field, enter your adapter name or type, or use the +For the latest Intel network drivers for Linux, refer to the following +website. 
In the search field, enter your adapter name or type, or use the networking link on the left to search for your adapter: http://downloadfinder.intel.com/scripts-df/support_intel.asp -Command Line Parameters -======================= -If the driver is built as a module, the following optional parameters are -used by entering them on the command line with the modprobe or insmod command -using this syntax: +Command Line Parameters ======================= + +If the driver is built as a module, the following optional parameters +are used by entering them on the command line with the modprobe or insmod +command using this syntax: modprobe e1000 [<option>=<VAL1>,<VAL2>,...] - insmod e1000 [<option>=<VAL1>,<VAL2>,...] + insmod e1000 [<option>=<VAL1>,<VAL2>,...] For example, with two PRO/1000 PCI adapters, entering: insmod e1000 TxDescriptors=80,128 -loads the e1000 driver with 80 TX descriptors for the first adapter and 128 TX -descriptors for the second adapter. +loads the e1000 driver with 80 TX descriptors for the first adapter and 128 +TX descriptors for the second adapter. The default value for each parameter is generally the recommended setting, -unless otherwise noted. Also, if the driver is statically built into the -kernel, the driver is loaded with the default values for all the parameters. -Ethtool can be used to change some of the parameters at runtime. +unless otherwise noted. + +NOTES: For more information about the AutoNeg, Duplex, and Speed + parameters, see the "Speed and Duplex Configuration" section in + this document. - NOTES: For more information about the AutoNeg, Duplex, and Speed - parameters, see the "Speed and Duplex Configuration" section in - this document. + For more information about the InterruptThrottleRate, + RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay + parameters, see the application note at: + http://www.intel.com/design/network/applnots/ap450.htm - For more information about the InterruptThrottleRate, RxIntDelay, - TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay parameters, see the - application note at: - http://www.intel.com/design/network/applnots/ap450.htm + A descriptor describes a data buffer and attributes related to + the data buffer. This information is accessed by the hardware. - A descriptor describes a data buffer and attributes related to the - data buffer. This information is accessed by the hardware. -AutoNeg (adapters using copper connections only) -Valid Range: 0x01-0x0F, 0x20-0x2F +AutoNeg +------- +(Supported only on adapters with copper connections) +Valid Range: 0x01-0x0F, 0x20-0x2F Default Value: 0x2F - This parameter is a bit mask that specifies which speed and duplex - settings the board advertises. When this parameter is used, the Speed and - Duplex parameters must not be specified. - NOTE: Refer to the Speed and Duplex section of this readme for more - information on the AutoNeg parameter. - -Duplex (adapters using copper connections only) -Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) + +This parameter is a bit mask that specifies which speed and duplex +settings the board advertises. When this parameter is used, the Speed +and Duplex parameters must not be specified. + +NOTE: Refer to the Speed and Duplex section of this readme for more + information on the AutoNeg parameter. + + +Duplex +------ +(Supported only on adapters with copper connections) +Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full) Default Value: 0 - Defines the direction in which data is allowed to flow. Can be either one - or two-directional. 
If both Duplex and the link partner are set to auto- - negotiate, the board auto-detects the correct duplex. If the link partner - is forced (either full or half), Duplex defaults to half-duplex. + +Defines the direction in which data is allowed to flow. Can be either +one or two-directional. If both Duplex and the link partner are set to +auto-negotiate, the board auto-detects the correct duplex. If the link +partner is forced (either full or half), Duplex defaults to half-duplex. + FlowControl -Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) -Default: Read flow control settings from the EEPROM - This parameter controls the automatic generation(Tx) and response(Rx) to - Ethernet PAUSE frames. +---------- +Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx) +Default Value: Reads flow control settings from the EEPROM + +This parameter controls the automatic generation(Tx) and response(Rx) +to Ethernet PAUSE frames. + InterruptThrottleRate -Valid Range: 100-100000 (0=off, 1=dynamic) +--------------------- +(not supported on Intel 82542, 82543 or 82544-based adapters) +Valid Range: 100-100000 (0=off, 1=dynamic) Default Value: 8000 - This value represents the maximum number of interrupts per second the - controller generates. InterruptThrottleRate is another setting used in - interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust - InterruptThrottleRate based on the current traffic load. -Un-supported Adapters: InterruptThrottleRate is NOT supported by 82542, 82543 - or 82544-based adapters. - - NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and - RxAbsIntDelay parameters. In other words, minimizing the receive - and/or transmit absolute delays does not force the controller to - generate more interrupts than what the Interrupt Throttle Rate - allows. - CAUTION: If you are using the Intel PRO/1000 CT Network Connection - (controller 82547), setting InterruptThrottleRate to a value - greater than 75,000, may hang (stop transmitting) adapters under - certain network conditions. If this occurs a NETDEV WATCHDOG - message is logged in the system event log. In addition, the - controller is automatically reset, restoring the network - connection. To eliminate the potential for the hang, ensure - that InterruptThrottleRate is set no greater than 75,000 and is - not set to 0. - NOTE: When e1000 is loaded with default settings and multiple adapters are - in use simultaneously, the CPU utilization may increase non-linearly. - In order to limit the CPU utilization without impacting the overall - throughput, we recommend that you load the driver as follows: - - insmod e1000.o InterruptThrottleRate=3000,3000,3000 - - This sets the InterruptThrottleRate to 3000 interrupts/sec for the - first, second, and third instances of the driver. The range of 2000 to - 3000 interrupts per second works on a majority of systems and is a - good starting point, but the optimal value will be platform-specific. - If CPU utilization is not a concern, use RX_POLLING (NAPI) and default - driver settings. + +This value represents the maximum number of interrupts per second the +controller generates. InterruptThrottleRate is another setting used in +interrupt moderation. Dynamic mode uses a heuristic algorithm to adjust +InterruptThrottleRate based on the current traffic load. + +NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and + RxAbsIntDelay parameters. 
In other words, minimizing the receive + and/or transmit absolute delays does not force the controller to + generate more interrupts than what the Interrupt Throttle Rate + allows. + +CAUTION: If you are using the Intel PRO/1000 CT Network Connection + (controller 82547), setting InterruptThrottleRate to a value + greater than 75,000, may hang (stop transmitting) adapters + under certain network conditions. If this occurs a NETDEV + WATCHDOG message is logged in the system event log. In + addition, the controller is automatically reset, restoring + the network connection. To eliminate the potential for the + hang, ensure that InterruptThrottleRate is set no greater + than 75,000 and is not set to 0. + +NOTE: When e1000 is loaded with default settings and multiple adapters + are in use simultaneously, the CPU utilization may increase non- + linearly. In order to limit the CPU utilization without impacting + the overall throughput, we recommend that you load the driver as + follows: + + insmod e1000.o InterruptThrottleRate=3000,3000,3000 + + This sets the InterruptThrottleRate to 3000 interrupts/sec for + the first, second, and third instances of the driver. The range + of 2000 to 3000 interrupts per second works on a majority of + systems and is a good starting point, but the optimal value will + be platform-specific. If CPU utilization is not a concern, use + RX_POLLING (NAPI) and default driver settings. + RxDescriptors -Valid Range: 80-256 for 82542 and 82543-based adapters - 80-4096 for all other supported adapters +------------- +Valid Range: 80-256 for 82542 and 82543-based adapters + 80-4096 for all other supported adapters Default Value: 256 - This value is the number of receive descriptors allocated by the driver. - Increasing this value allows the driver to buffer more incoming packets. - Each descriptor is 16 bytes. A receive buffer is allocated for each - descriptor and can either be 2048 or 4096 bytes long, depending on the MTU - setting. An incoming packet can span one or more receive descriptors. - The maximum MTU size is 16110. +This value specifies the number of receive descriptors allocated by the +driver. Increasing this value allows the driver to buffer more incoming +packets. Each descriptor is 16 bytes. A receive buffer is also +allocated for each descriptor and is 2048. - NOTE: MTU designates the frame size. It only needs to be set for Jumbo - Frames. - NOTE: Depending on the available system resources, the request for a - higher number of receive descriptors may be denied. In this case, - use a lower number. RxIntDelay -Valid Range: 0-65535 (0=off) +---------- +Valid Range: 0-65535 (0=off) Default Value: 0 - This value delays the generation of receive interrupts in units of 1.024 - microseconds. Receive interrupt reduction can improve CPU efficiency if - properly tuned for specific network traffic. Increasing this value adds - extra latency to frame reception and can end up decreasing the throughput - of TCP traffic. If the system is reporting dropped receives, this value - may be set too high, causing the driver to run out of available receive - descriptors. - - CAUTION: When setting RxIntDelay to a value other than 0, adapters may - hang (stop transmitting) under certain network conditions. If - this occurs a NETDEV WATCHDOG message is logged in the system - event log. In addition, the controller is automatically reset, - restoring the network connection. To eliminate the potential for - the hang ensure that RxIntDelay is set to 0. 
- -RxAbsIntDelay (82540, 82545 and later adapters only) -Valid Range: 0-65535 (0=off) + +This value delays the generation of receive interrupts in units of 1.024 +microseconds. Receive interrupt reduction can improve CPU efficiency if +properly tuned for specific network traffic. Increasing this value adds +extra latency to frame reception and can end up decreasing the throughput +of TCP traffic. If the system is reporting dropped receives, this value +may be set too high, causing the driver to run out of available receive +descriptors. + +CAUTION: When setting RxIntDelay to a value other than 0, adapters may + hang (stop transmitting) under certain network conditions. If + this occurs a NETDEV WATCHDOG message is logged in the system + event log. In addition, the controller is automatically reset, + restoring the network connection. To eliminate the potential + for the hang ensure that RxIntDelay is set to 0. + + +RxAbsIntDelay +------------- +(This parameter is supported only on 82540, 82545 and later adapters.) +Valid Range: 0-65535 (0=off) Default Value: 128 - This value, in units of 1.024 microseconds, limits the delay in which a - receive interrupt is generated. Useful only if RxIntDelay is non-zero, - this value ensures that an interrupt is generated after the initial - packet is received within the set amount of time. Proper tuning, - along with RxIntDelay, may improve traffic throughput in specific network - conditions. - -Speed (adapters using copper connections only) + +This value, in units of 1.024 microseconds, limits the delay in which a +receive interrupt is generated. Useful only if RxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is received within the set amount of time. Proper tuning, +along with RxIntDelay, may improve traffic throughput in specific network +conditions. + + +Speed +----- +(This parameter is supported only on adapters with copper connections.) Valid Settings: 0, 10, 100, 1000 -Default Value: 0 (auto-negotiate at all supported speeds) - Speed forces the line speed to the specified value in megabits per second - (Mbps). If this parameter is not specified or is set to 0 and the link - partner is set to auto-negotiate, the board will auto-detect the correct - speed. Duplex should also be set when Speed is set to either 10 or 100. +Default Value: 0 (auto-negotiate at all supported speeds) + +Speed forces the line speed to the specified value in megabits per second +(Mbps). If this parameter is not specified or is set to 0 and the link +partner is set to auto-negotiate, the board will auto-detect the correct +speed. Duplex should also be set when Speed is set to either 10 or 100. + TxDescriptors -Valid Range: 80-256 for 82542 and 82543-based adapters - 80-4096 for all other supported adapters +------------- +Valid Range: 80-256 for 82542 and 82543-based adapters + 80-4096 for all other supported adapters Default Value: 256 - This value is the number of transmit descriptors allocated by the driver. - Increasing this value allows the driver to queue more transmits. Each - descriptor is 16 bytes. - NOTE: Depending on the available system resources, the request for a - higher number of transmit descriptors may be denied. In this case, - use a lower number. +This value is the number of transmit descriptors allocated by the driver. +Increasing this value allows the driver to queue more transmits. Each +descriptor is 16 bytes. 
+ +NOTE: Depending on the available system resources, the request for a + higher number of transmit descriptors may be denied. In this case, + use a lower number. + TxIntDelay -Valid Range: 0-65535 (0=off) +---------- +Valid Range: 0-65535 (0=off) Default Value: 64 - This value delays the generation of transmit interrupts in units of - 1.024 microseconds. Transmit interrupt reduction can improve CPU - efficiency if properly tuned for specific network traffic. If the - system is reporting dropped transmits, this value may be set too high - causing the driver to run out of available transmit descriptors. - -TxAbsIntDelay (82540, 82545 and later adapters only) -Valid Range: 0-65535 (0=off) + +This value delays the generation of transmit interrupts in units of +1.024 microseconds. Transmit interrupt reduction can improve CPU +efficiency if properly tuned for specific network traffic. If the +system is reporting dropped transmits, this value may be set too high +causing the driver to run out of available transmit descriptors. + + +TxAbsIntDelay +------------- +(This parameter is supported only on 82540, 82545 and later adapters.) +Valid Range: 0-65535 (0=off) Default Value: 64 - This value, in units of 1.024 microseconds, limits the delay in which a - transmit interrupt is generated. Useful only if TxIntDelay is non-zero, - this value ensures that an interrupt is generated after the initial - packet is sent on the wire within the set amount of time. Proper tuning, - along with TxIntDelay, may improve traffic throughput in specific - network conditions. - -XsumRX (not available on the 82542-based adapter) -Valid Range: 0-1 + +This value, in units of 1.024 microseconds, limits the delay in which a +transmit interrupt is generated. Useful only if TxIntDelay is non-zero, +this value ensures that an interrupt is generated after the initial +packet is sent on the wire within the set amount of time. Proper tuning, +along with TxIntDelay, may improve traffic throughput in specific +network conditions. + +XsumRX +------ +(This parameter is NOT supported on the 82542-based adapter.) +Valid Range: 0-1 Default Value: 1 - A value of '1' indicates that the driver should enable IP checksum - offload for received packets (both UDP and TCP) to the adapter hardware. + +A value of '1' indicates that the driver should enable IP checksum +offload for received packets (both UDP and TCP) to the adapter hardware. + Speed and Duplex Configuration ============================== -Three keywords are used to control the speed and duplex configuration. These -keywords are Speed, Duplex, and AutoNeg. +Three keywords are used to control the speed and duplex configuration. +These keywords are Speed, Duplex, and AutoNeg. -If the board uses a fiber interface, these keywords are ignored, and the +If the board uses a fiber interface, these keywords are ignored, and the fiber interface board only links at 1000 Mbps full-duplex. For copper-based boards, the keywords interact as follows: - The default operation is auto-negotiate. The board advertises all supported - speed and duplex combinations, and it links at the highest common speed and - duplex mode IF the link partner is set to auto-negotiate. + The default operation is auto-negotiate. The board advertises all + supported speed and duplex combinations, and it links at the highest + common speed and duplex mode IF the link partner is set to auto-negotiate. 
- If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps is - advertised (The 1000BaseT spec requires auto-negotiation.) + If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps + is advertised (The 1000BaseT spec requires auto-negotiation.) If Speed = 10 or 100, then both Speed and Duplex should be set. Auto- - negotiation is disabled, and the AutoNeg parameter is ignored. Partner SHOULD - also be forced. + negotiation is disabled, and the AutoNeg parameter is ignored. Partner + SHOULD also be forced. + +The AutoNeg parameter is used when more control is required over the +auto-negotiation process. It should be used when you wish to control which +speed and duplex combinations are advertised during the auto-negotiation +process. + +The parameter may be specified as either a decimal or hexidecimal value as +determined by the bitmap below. -The AutoNeg parameter is used when more control is required over the auto- -negotiation process. When this parameter is used, Speed and Duplex parameters -must not be specified. The following table describes supported values for the -AutoNeg parameter: +Bit position 7 6 5 4 3 2 1 0 +Decimal Value 128 64 32 16 8 4 2 1 +Hex value 80 40 20 10 8 4 2 1 +Speed (Mbps) N/A N/A 1000 N/A 100 100 10 10 +Duplex Full Full Half Full Half -Speed (Mbps) 1000 100 100 10 10 -Duplex Full Full Half Full Half -Value (in base 16) 0x20 0x08 0x04 0x02 0x01 +Some examples of using AutoNeg: -Example: insmod e1000 AutoNeg=0x03, loads e1000 and specifies (10 full duplex, -10 half duplex) for negotiation with the peer. + modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half) + modprobe e1000 AutoNeg=1 (Same as above) + modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full) + modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full) + modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half) + modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100 + Half) + modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full) + modprobe e1000 AutoNeg=32 (Same as above) -Note that setting AutoNeg does not guarantee that the board will link at the -highest specified speed or duplex mode, but the board will link at the -highest possible speed/duplex of the link partner IF the link partner is also -set to auto-negotiate. If the link partner is forced speed/duplex, the -adapter MUST be forced to the same speed/duplex. +Note that when this parameter is used, Speed and Duplex must not be specified. + +If the link partner is forced to a specific speed and duplex, then this +parameter should not be used. Instead, use the Speed and Duplex parameters +previously mentioned to force the adapter to the same speed and duplex. Additional Configurations @@ -276,19 +338,19 @@ Additional Configurations Configuring the Driver on Different Distributions ------------------------------------------------- - Configuring a network driver to load properly when the system is started is - distribution dependent. Typically, the configuration process involves adding - an alias line to /etc/modules.conf as well as editing other system startup - scripts and/or configuration files. Many popular Linux distributions ship - with tools to make these changes for you. To learn the proper way to - configure a network device for your system, refer to your distribution - documentation. 
If during this process you are asked for the driver or module - name, the name for the Linux Base Driver for the Intel PRO/1000 Family of - Adapters is e1000. + Configuring a network driver to load properly when the system is started + is distribution dependent. Typically, the configuration process involves + adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well + as editing other system startup scripts and/or configuration files. Many + popular Linux distributions ship with tools to make these changes for you. + To learn the proper way to configure a network device for your system, + refer to your distribution documentation. If during this process you are + asked for the driver or module name, the name for the Linux Base Driver + for the Intel PRO/1000 Family of Adapters is e1000. - As an example, if you install the e1000 driver for two PRO/1000 adapters - (eth0 and eth1) and set the speed and duplex to 10full and 100half, add the - following to modules.conf: + As an example, if you install the e1000 driver for two PRO/1000 adapters + (eth0 and eth1) and set the speed and duplex to 10full and 100half, add + the following to modules.conf or or modprobe.conf: alias eth0 e1000 alias eth1 e1000 @@ -297,9 +359,9 @@ Additional Configurations Viewing Link Messages --------------------- - Link messages will not be displayed to the console if the distribution is - restricting system messages. In order to see network driver link messages on - your console, set dmesg to eight by entering the following: + Link messages will not be displayed to the console if the distribution is + restricting system messages. In order to see network driver link messages + on your console, set dmesg to eight by entering the following: dmesg -n 8 @@ -308,22 +370,42 @@ Additional Configurations Jumbo Frames ------------ - The driver supports Jumbo Frames for all adapters except 82542-based - adapters. Jumbo Frames support is enabled by changing the MTU to a value - larger than the default of 1500. Use the ifconfig command to increase the - MTU size. For example: + The driver supports Jumbo Frames for all adapters except 82542 and + 82573-based adapters. Jumbo Frames support is enabled by changing the + MTU to a value larger than the default of 1500. Use the ifconfig command + to increase the MTU size. For example: + + ifconfig eth<x> mtu 9000 up + + This setting is not saved across reboots. It can be made permanent if + you add: + + MTU=9000 - ifconfig ethx mtu 9000 up + to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example + applies to the Red Hat distributions; other distributions may store this + setting in a different location. - The maximum MTU setting for Jumbo Frames is 16110. This value coincides - with the maximum Jumbo Frames size of 16128. + Notes: - NOTE: Jumbo Frames are supported at 1000 Mbps only. Using Jumbo Frames at - 10 or 100 Mbps may result in poor performance or loss of link. + - To enable Jumbo Frames, increase the MTU size on the interface beyond + 1500. + - The maximum MTU setting for Jumbo Frames is 16110. This value coincides + with the maximum Jumbo Frames size of 16128. + - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or + loss of link. + - Some Intel gigabit adapters that support Jumbo Frames have a frame size + limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes. 
+ The adapters with this limitation are based on the Intel 82571EB and + 82572EI controllers, which correspond to these product names: + Intel� PRO/1000 PT Dual Port Server Adapter + Intel� PRO/1000 PF Dual Port Server Adapter + Intel� PRO/1000 PT Server Adapter + Intel� PRO/1000 PT Desktop Adapter + Intel� PRO/1000 PF Server Adapter + - The Intel PRO/1000 PM Network Connection does not support jumbo frames. - NOTE: MTU designates the frame size. To enable Jumbo Frames, increase the - MTU size on the interface beyond 1500. Ethtool ------- @@ -333,32 +415,41 @@ Additional Configurations version 1.6 or later is required for this functionality. The latest release of ethtool can be found from - http://sf.net/projects/gkernel. + http://sourceforge.net/projects/gkernel. - NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support - for a more complete ethtool feature set can be enabled by upgrading - ethtool to ethtool-1.8.1. + NOTE: Ethtool 1.6 only supports a limited set of ethtool options. Support + for a more complete ethtool feature set can be enabled by upgrading + ethtool to ethtool-1.8.1. Enabling Wake on LAN* (WoL) --------------------------- WoL is configured through the Ethtool* utility. Ethtool is included with - all versions of Red Hat after Red Hat 7.2. For other Linux distributions, - download and install Ethtool from the following website: + all versions of Red Hat after Red Hat 7.2. For other Linux distributions, + download and install Ethtool from the following website: http://sourceforge.net/projects/gkernel. - For instructions on enabling WoL with Ethtool, refer to the website listed + For instructions on enabling WoL with Ethtool, refer to the website listed above. - WoL will be enabled on the system during the next shut down or reboot. - For this driver version, in order to enable WoL, the e1000 driver must be + WoL will be enabled on the system during the next shut down or reboot. + For this driver version, in order to enable WoL, the e1000 driver must be loaded when shutting down or rebooting the system. NAPI ---- NAPI (Rx polling mode) is supported in the e1000 driver. NAPI is enabled - or disabled based on the configuration of the kernel. + or disabled based on the configuration of the kernel. To override + the default, use the following compile-time flags. + + To enable NAPI, compile the driver module, passing in a configuration option: + + make CFLAGS_EXTRA=-DE1000_NAPI install + + To disable NAPI, compile the driver module, passing in a configuration option: + + make CFLAGS_EXTRA=-DE1000_NO_NAPI install See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI. @@ -369,10 +460,85 @@ Known Issues Jumbo Frames System Requirement ------------------------------- - Memory allocation failures have been observed on Linux systems with 64 MB - of RAM or less that are running Jumbo Frames. If you are using Jumbo Frames, - your system may require more than the advertised minimum requirement of 64 MB - of system memory. + Memory allocation failures have been observed on Linux systems with 64 MB + of RAM or less that are running Jumbo Frames. If you are using Jumbo + Frames, your system may require more than the advertised minimum + requirement of 64 MB of system memory. + + Performance Degradation with Jumbo Frames + ----------------------------------------- + + Degradation in throughput performance may be observed in some Jumbo frames + environments. 
If this is observed, increasing the application's socket + buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values + may help. See the specific application manual and + /usr/src/linux*/Documentation/ + networking/ip-sysctl.txt for more details. + + Jumbo frames on Foundry BigIron 8000 switch + ------------------------------------------- + There is a known issue using Jumbo frames when connected to a Foundry + BigIron 8000 switch. This is a 3rd party limitation. If you experience + loss of packets, lower the MTU size. + + Multiple Interfaces on Same Ethernet Broadcast Network + ------------------------------------------------------ + + Due to the default ARP behavior on Linux, it is not possible to have + one system on two IP networks in the same Ethernet broadcast domain + (non-partitioned switch) behave as expected. All Ethernet interfaces + will respond to IP traffic for any IP address assigned to the system. + This results in unbalanced receive traffic. + + If you have multiple interfaces in a server, either turn on ARP + filtering by entering: + + echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter + (this only works if your kernel's version is higher than 2.4.5), + + NOTE: This setting is not saved across reboots. The configuration + change can be made permanent by adding the line: + net.ipv4.conf.all.arp_filter = 1 + to the file /etc/sysctl.conf + + or, + + install the interfaces in separate broadcast domains (either in + different switches or in a switch partitioned to VLANs). + + 82541/82547 can't link or are slow to link with some link partners + ----------------------------------------------------------------- + + There is a known compatibility issue with 82541/82547 and some + low-end switches where the link will not be established, or will + be slow to establish. In particular, these switches are known to + be incompatible with 82541/82547: + + Planex FXG-08TE + I-O Data ETG-SH8 + + To workaround this issue, the driver can be compiled with an override + of the PHY's master/slave setting. Forcing master or forcing slave + mode will improve time-to-link. + + # make EXTRA_CFLAGS=-DE1000_MASTER_SLAVE=<n> + + Where <n> is: + + 0 = Hardware default + 1 = Master mode + 2 = Slave mode + 3 = Auto master/slave + + Disable rx flow control with ethtool + ------------------------------------ + + In order to disable receive flow control using ethtool, you must turn + off auto-negotiation on the same command line. + + For example: + + ethtool -A eth? autoneg off rx off Support @@ -382,20 +548,24 @@ For general information, go to the Intel support website at: http://support.intel.com + or the Intel Wired Networking project hosted by Sourceforge at: + + http://sourceforge.net/projects/e1000 + If an issue is identified with the released source code on the supported -kernel with a supported adapter, email the specific information related to -the issue to linux.nics@intel.com. +kernel with a supported adapter, email the specific information related +to the issue to e1000-devel@lists.sourceforge.net License ======= -This software program is released under the terms of a license agreement -between you ('Licensee') and Intel. Do not use or load this software or any -associated materials (collectively, the 'Software') until you have carefully -read the full terms and conditions of the LICENSE located in this software -package. By loading or using the Software, you agree to the terms of this -Agreement. 
If you do not agree with the terms of this Agreement, do not +This software program is released under the terms of a license agreement +between you ('Licensee') and Intel. Do not use or load this software or any +associated materials (collectively, the 'Software') until you have carefully +read the full terms and conditions of the file COPYING located in this software +package. By loading or using the Software, you agree to the terms of this +Agreement. If you do not agree with the terms of this Agreement, do not install or use the Software. * Other names and brands may be claimed as the property of others. diff --git a/MAINTAINERS b/MAINTAINERS index b0dc75a5e74..dd1351dc32b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1349,10 +1349,10 @@ S: Maintained INTEL PRO/100 ETHERNET SUPPORT P: John Ronciak M: john.ronciak@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com P: Jesse Brandeburg M: jesse.brandeburg@intel.com +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported @@ -1361,18 +1361,22 @@ P: Jeb Cramer M: cramerj@intel.com P: John Ronciak M: john.ronciak@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com +P: Jesse Brandeburg +M: jesse.brandeburg@intel.com +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported INTEL PRO/10GbE SUPPORT +P: Jeff Kirsher +M: jeffrey.t.kirsher@intel.com P: Ayyappan Veeraiyan M: ayyappan.veeraiyan@intel.com -P: Ganesh Venkatesan -M: ganesh.venkatesan@intel.com P: John Ronciak M: john.ronciak@intel.com +P: Jesse Brandeburg +M: jesse.brandeburg@intel.com W: http://sourceforge.net/projects/e1000/ S: Supported diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 486d7945583..544ac5dc09e 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end) void *__start = start; for (; __start < end; __start += PAGE_SIZE) { ClearPageReserved(virt_to_page(__start)); - set_page_count(virt_to_page(__start), 1); + init_page_count(virt_to_page(__start)); free_page((long)__start); totalram_pages++; } diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index c2ee18d2075..8a1bfcd5008 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pte = consistent_pte[idx] + off; c->vm_pages = page; + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); /* * x86 does not mark the pages reserved... */ @@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, * Free the otherwise unused pages. 
*/ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 8b276ee38ac..b0321e943b7 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index 1f09a9d0fb8..e3ecaa45374 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 31a0018b525..b7842ff213a 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -216,7 +216,7 @@ free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index 0f1c6cbc4f5..aa6b7d0a210 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -27,6 +27,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 342823aad75..636b2f8b5d9 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle) */ if (order > 0) { struct page *rpage = virt_to_page(page); - - for (i = 1; i < (1 << order); i++) - set_page_count(rpage + i, 1); + split_page(rpage, order); } err = 0; diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 765088ea8a5..8899aa1a4f0 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -169,7 +169,7 @@ void __init mem_init(void) struct page *page = &mem_map[pfn]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; } @@ -210,7 +210,7 @@ void __init free_initmem(void) /* next to check that the page we free is not a partial page */ for (addr = start; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c index 5cc76efaf7a..69d6ad32d56 100644 --- a/arch/h8300/kernel/h8300_ksyms.c +++ b/arch/h8300/kernel/h8300_ksyms.c @@ -25,6 +25,7 @@ extern char h8300_debug_device[]; /* platform dependent support */ EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); 
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1e0929ddc8c..09efc4b1f03 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -219,7 +219,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index c9cad7ba0d2..aeabb419686 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void) unsigned long cr4; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - cpu_gdt_descr->address = __va(cpu_gdt_descr->address); + cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); load_gdt(cpu_gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 218d725a5a1..d134e9643a5 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void) spin_unlock_irq(&call_lock); } -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ - -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until +static struct call_data_struct *call_data; + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until * remote CPUs are nearly ready to execute <<func>> or are or have executed. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) { struct call_data_struct data; int cpus; diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index a4a61976ecb..8fdb1fb17a5 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -40,14 +40,13 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) return error; } -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { int error = -EBADF; - struct file * file = NULL; + struct file *file = NULL; + struct mm_struct *mm = current->mm; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { @@ -56,9 +55,9 @@ static inline long do_mmap2( goto out; } - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); if (file) fput(file); @@ -66,13 +65,6 @@ out: return error; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -101,7 +93,8 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + a.fd, a.offset >> PAGE_SHIFT); out: return err; } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index be242723c33..17a6fe7166e 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -46,7 +46,7 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index a7f5a2aceba..5e41ee29c8c 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -74,7 +74,7 @@ late_initcall(start_lost_tick_compensation); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9af..a7d89158541 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. 
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994b..7ba55a6e2db 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) static void __meminit free_new_highpage(struct page *page) { - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -727,7 +727,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index d0cadb33b54..92c3d9f0e73 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -51,6 +51,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page that have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -143,11 +150,12 @@ __change_page_attr(struct page *page, pgprot_t prot) return -ENOMEM; set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); kpte_page = split; - } - get_page(kpte_page); + } + page_private(kpte_page)++; } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); @@ -157,10 +165,8 @@ __change_page_attr(struct page *page, pgprot_t prot) * replace it with a largepage. */ if (!PageReserved(kpte_page)) { - /* memleak and potential failed 2M page regeneration */ - BUG_ON(!page_count(kpte_page)); - - if (cpu_has_pse && (page_count(kpte_page) == 1)) { + if (cpu_has_pse && (page_private(kpte_page) == 0)) { + ClearPagePrivate(kpte_page); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index a85ea9d37f0..ff7ae6b664e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -271,6 +271,25 @@ config SCHED_SMT Intel IA64 chips with MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config PERMIT_BSP_REMOVE + bool "Support removal of Bootstrap Processor" + depends on HOTPLUG_CPU + default n + ---help--- + Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU + support. 
+ +config FORCE_CPEI_RETARGET + bool "Force assumption that CPEI can be re-targetted" + depends on PERMIT_BSP_REMOVE + default n + ---help--- + Say Y if you need to force the assumption that CPEI can be re-targetted to + any cpu in the system. This hint is available via ACPI 3.0 specifications. + Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP. + This option it useful to enable this feature on older BIOS's as well. + You can also enable this by using boot command line option force_cpei=1. + config PREEMPT bool "Preemptible Kernel" help diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index 125568118b8..766bf495543 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -116,6 +116,8 @@ CONFIG_FORCE_MAX_ZONEORDER=17 CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_HOTPLUG_CPU=y +CONFIG_PERMIT_BSP_REMOVE=y +CONFIG_FORCE_CPEI_RETARGET=y # CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set CONFIG_SELECT_MEMORY_MODEL=y diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index ecd44bdc839..4722ec51c70 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -284,19 +284,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header, return 0; } +#ifdef CONFIG_HOTPLUG_CPU unsigned int can_cpei_retarget(void) { extern int cpe_vector; + extern unsigned int force_cpei_retarget; /* * Only if CPEI is supported and the override flag * is present, otherwise return that its re-targettable * if we are in polling mode. */ - if (cpe_vector > 0 && !acpi_cpei_override) - return 0; - else - return 1; + if (cpe_vector > 0) { + if (acpi_cpei_override || force_cpei_retarget) + return 1; + else + return 0; + } + return 1; } unsigned int is_cpu_cpei_target(unsigned int cpu) @@ -315,6 +320,7 @@ void set_cpei_target_cpu(unsigned int cpu) { acpi_cpei_phys_cpuid = cpu_physical_id(cpu); } +#endif unsigned int get_cpei_target_cpu(void) { diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 930fdfca6dd..0e3eda99e54 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1102,9 +1102,6 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? -(p6) br.cond.sptk.few .sigdelayed - ;; tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? (p6) br.cond.sptk.few .notify #ifdef CONFIG_PREEMPT @@ -1131,17 +1128,6 @@ skip_rbs_switch: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // don't re-check -// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where -// it could not be delivered. Deliver it now. The signal might be for us and -// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed -// signal. 
- -.sigdelayed: - br.call.sptk.many rp=do_sigdelayed - cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // re-check - .work_pending_syscall_end: adds r2=PT(R8)+16,r12 adds r3=PT(R10)+16,r12 diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 574084f343f..8832c553230 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -631,6 +631,7 @@ get_target_cpu (unsigned int gsi, int vector) { #ifdef CONFIG_SMP static int cpu = -1; + extern int cpe_vector; /* * In case of vector shared by multiple RTEs, all RTEs that @@ -653,6 +654,11 @@ get_target_cpu (unsigned int gsi, int vector) if (!cpu_online(smp_processor_id())) return cpu_physical_id(smp_processor_id()); +#ifdef CONFIG_ACPI + if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR) + return get_cpei_target_cpu(); +#endif + #ifdef CONFIG_NUMA { int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index d33244c3275..5ce908ef9c9 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -163,8 +163,19 @@ void fixup_irqs(void) { unsigned int irq; extern void ia64_process_pending_intr(void); + extern void ia64_disable_timer(void); + extern volatile int time_keeper_id; + + ia64_disable_timer(); + + /* + * Find a new timesync master + */ + if (smp_processor_id() == time_keeper_id) { + time_keeper_id = first_cpu(cpu_online_map); + printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id); + } - ia64_set_itv(1<<16); /* * Phase 1: Locate irq's bound to this cpu and * relocate them for cpu removal. diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index ee7eec9ee57..b57e723f194 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -281,14 +281,10 @@ ia64_mca_log_sal_error_record(int sal_info_type) ia64_sal_clear_state_info(sal_info_type); } -/* - * platform dependent error handling - */ -#ifndef PLATFORM_MCA_HANDLERS - #ifdef CONFIG_ACPI int cpe_vector = -1; +int ia64_cpe_irq = -1; static irqreturn_t ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) @@ -377,8 +373,6 @@ ia64_mca_register_cpev (int cpev) } #endif /* CONFIG_ACPI */ -#endif /* PLATFORM_MCA_HANDLERS */ - /* * ia64_mca_cmc_vector_setup * @@ -630,6 +624,32 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) *tnat |= (nat << tslot); } +/* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + +static void +ia64_mca_modify_comm(const task_t *previous_current) +{ + char *p, comm[sizeof(current->comm)]; + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + task_thread_info(previous_current)->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); +} + /* On entry to this routine, we are running on the per cpu stack, see * mca_asm.h. The original stack has not been touched by this event. Some of * the original stack's registers will be in the RBS on this stack. 
This stack @@ -648,7 +668,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, struct ia64_sal_os_state *sos, const char *type) { - char *p, comm[sizeof(current->comm)]; + char *p; ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ const pal_min_state_area_t *ms = sos->pal_min_state; @@ -721,6 +741,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, /* Verify the previous stack state before we change it */ if (user_mode(regs)) { msg = "occurred in user space"; + /* previous_current is guaranteed to be valid when the task was + * in user space, so ... + */ + ia64_mca_modify_comm(previous_current); goto no_mod; } if (r13 != sos->prev_IA64_KR_CURRENT) { @@ -750,25 +774,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, goto no_mod; } - /* Change the comm field on the MCA/INT task to include the pid that - * was interrupted, it makes for easier debugging. If that pid was 0 - * (swapper or nested MCA/INIT) then use the start of the previous comm - * field suffixed with its cpu. - */ - if (previous_current->pid) - snprintf(comm, sizeof(comm), "%s %d", - current->comm, previous_current->pid); - else { - int l; - if ((p = strchr(previous_current->comm, ' '))) - l = p - previous_current->comm; - else - l = strlen(previous_current->comm); - snprintf(comm, sizeof(comm), "%s %*s %d", - current->comm, l, previous_current->comm, - task_thread_info(previous_current)->cpu); - } - memcpy(current->comm, comm, sizeof(current->comm)); + ia64_mca_modify_comm(previous_current); /* Make the original task look blocked. First stack a struct pt_regs, * describing the state at the time of interrupt. mca_asm.S built a @@ -908,7 +914,7 @@ no_mod: static void ia64_wait_for_slaves(int monarch) { - int c, wait = 0; + int c, wait = 0, missing = 0; for_each_online_cpu(c) { if (c == monarch) continue; @@ -919,15 +925,32 @@ ia64_wait_for_slaves(int monarch) } } if (!wait) - return; + goto all_in; for_each_online_cpu(c) { if (c == monarch) continue; if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + missing = 1; break; } } + if (!missing) + goto all_in; + printk(KERN_INFO "OS MCA slave did not rendezvous on cpu"); + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + printk(" %d", c); + } + printk("\n"); + return; + +all_in: + printk(KERN_INFO "All OS MCA slaves have reached rendezvous\n"); + return; } /* @@ -953,6 +976,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, task_t *previous_current; oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ + console_loglevel = 15; /* make sure printks make it to console */ + printk(KERN_INFO "Entered OS MCA handler. 
PSP=%lx cpu=%d monarch=%ld\n", + sos->proc_state_param, cpu, sos->monarch); + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); monarch_cpu = cpu; if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) @@ -1444,11 +1471,13 @@ void __devinit ia64_mca_cpu_init(void *cpu_data) { void *pal_vaddr; + static int first_time = 1; - if (smp_processor_id() == 0) { + if (first_time) { void *mca_data; int cpu; + first_time = 0; mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) * NR_CPUS + KERNEL_STACK_SIZE); mca_data = (void *)(((unsigned long)mca_data + @@ -1704,6 +1733,7 @@ ia64_mca_late_init(void) desc = irq_descp(irq); desc->status |= IRQ_PER_CPU; setup_irq(irq, &mca_cpe_irqaction); + ia64_cpe_irq = irq; } ia64_mca_register_cpev(cpe_vector); IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9c5194b385d..077f21216b6 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6722,6 +6722,7 @@ __initcall(pfm_init); void pfm_init_percpu (void) { + static int first_time=1; /* * make sure no measurement is active * (may inherit programmed PMCs from EFI). @@ -6734,8 +6735,10 @@ pfm_init_percpu (void) */ pfm_unfreeze_pmu(); - if (smp_processor_id() == 0) + if (first_time) { register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + first_time=0; + } ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ia64_srlz_d(); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 463f6bb44d0..1d7903ee212 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -588,104 +588,3 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) } return 0; } - -/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it - * could not be delivered. It is important that the target process is not - * allowed to do any more work in user space. Possible cases for the target - * process: - * - * - It is sleeping and will wake up soon. Store the data in the current task, - * the signal will be sent when the current task returns from the next - * interrupt. - * - * - It is running in user context. Store the data in the current task, the - * signal will be sent when the current task returns from the next interrupt. - * - * - It is running in kernel context on this or another cpu and will return to - * user context. Store the data in the target task, the signal will be sent - * to itself when the target task returns to user space. - * - * - It is running in kernel context on this cpu and will sleep before - * returning to user context. Because this is also the current task, the - * signal will not get delivered and the task could sleep indefinitely. - * Store the data in the idle task for this cpu, the signal will be sent - * after the idle task processes its next interrupt. - * - * To cover all cases, store the data in the target task, the current task and - * the idle task on this cpu. Whatever happens, the signal will be delivered - * to the target task before it can do any useful user space work. Multiple - * deliveries have no unwanted side effects. - * - * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts - * disabled. It must not take any locks nor use kernel structures or services - * that require locks. - */ - -/* To ensure that we get the right pid, check its start time. 
To avoid extra - * include files in thread_info.h, convert the task start_time to unsigned long, - * giving us a cycle time of > 580 years. - */ -static inline unsigned long -start_time_ul(const struct task_struct *t) -{ - return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; -} - -void -set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) -{ - struct task_struct *t; - unsigned long start_time = 0; - int i; - - for (i = 1; i <= 3; ++i) { - switch (i) { - case 1: - t = find_task_by_pid(pid); - if (t) - start_time = start_time_ul(t); - break; - case 2: - t = current; - break; - default: - t = idle_task(smp_processor_id()); - break; - } - - if (!t) - return; - task_thread_info(t)->sigdelayed.signo = signo; - task_thread_info(t)->sigdelayed.code = code; - task_thread_info(t)->sigdelayed.addr = addr; - task_thread_info(t)->sigdelayed.start_time = start_time; - task_thread_info(t)->sigdelayed.pid = pid; - wmb(); - set_tsk_thread_flag(t, TIF_SIGDELAYED); - } -} - -/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that - * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. - */ - -void -do_sigdelayed(void) -{ - struct siginfo siginfo; - pid_t pid; - struct task_struct *t; - - clear_thread_flag(TIF_SIGDELAYED); - memset(&siginfo, 0, sizeof(siginfo)); - siginfo.si_signo = current_thread_info()->sigdelayed.signo; - siginfo.si_code = current_thread_info()->sigdelayed.code; - siginfo.si_addr = current_thread_info()->sigdelayed.addr; - pid = current_thread_info()->sigdelayed.pid; - t = find_task_by_pid(pid); - if (!t) - return; - if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) - return; - force_sig_info(siginfo.si_signo, &siginfo, t); -} diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index b681ef34a86..c4b633b36da 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -70,6 +70,12 @@ #endif #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_PERMIT_BSP_REMOVE +#define bsp_remove_ok 1 +#else +#define bsp_remove_ok 0 +#endif + /* * Store all idle threads, this can be reused instead of creating * a new thread. Also avoids complicated thread destroy functionality @@ -104,7 +110,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0]; /* * ITC synchronization related stuff: */ -#define MASTER 0 +#define MASTER (0) #define SLAVE (SMP_CACHE_BYTES/8) #define NUM_ROUNDS 64 /* magic value */ @@ -151,6 +157,27 @@ char __initdata no_int_routing; unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? 
*/ +#ifdef CONFIG_FORCE_CPEI_RETARGET +#define CPEI_OVERRIDE_DEFAULT (1) +#else +#define CPEI_OVERRIDE_DEFAULT (0) +#endif + +unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT; + +static int __init +cmdl_force_cpei(char *str) +{ + int value=0; + + get_option (&str, &value); + force_cpei_retarget = value; + + return 1; +} + +__setup("force_cpei=", cmdl_force_cpei); + static int __init nointroute (char *str) { @@ -161,6 +188,27 @@ nointroute (char *str) __setup("nointroute", nointroute); +static void fix_b0_for_bsp(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + int cpuid; + static int fix_bsp_b0 = 1; + + cpuid = smp_processor_id(); + + /* + * Cache the b0 value on the first AP that comes up + */ + if (!(fix_bsp_b0 && cpuid)) + return; + + sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0]; + printk ("Fixed BSP b0 value from CPU %d\n", cpuid); + + fix_bsp_b0 = 0; +#endif +} + void sync_master (void *arg) { @@ -327,8 +375,9 @@ smp_setup_percpu_timer (void) static void __devinit smp_callin (void) { - int cpuid, phys_id; + int cpuid, phys_id, itc_master; extern void ia64_init_itm(void); + extern volatile int time_keeper_id; #ifdef CONFIG_PERFMON extern void pfm_init_percpu(void); @@ -336,6 +385,7 @@ smp_callin (void) cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); + itc_master = time_keeper_id; if (cpu_online(cpuid)) { printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", @@ -343,6 +393,8 @@ smp_callin (void) BUG(); } + fix_b0_for_bsp(); + lock_ipi_calllock(); cpu_set(cpuid, cpu_online_map); unlock_ipi_calllock(); @@ -365,8 +417,8 @@ smp_callin (void) * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls * local_bh_enable(), which bugs out if irqs are not enabled... */ - Dprintk("Going to syncup ITC with BP.\n"); - ia64_sync_itc(0); + Dprintk("Going to syncup ITC with ITC Master.\n"); + ia64_sync_itc(itc_master); } /* @@ -635,6 +687,47 @@ remove_siblinginfo(int cpu) } extern void fixup_irqs(void); + +int migrate_platform_irqs(unsigned int cpu) +{ + int new_cpei_cpu; + irq_desc_t *desc = NULL; + cpumask_t mask; + int retval = 0; + + /* + * dont permit CPEI target to removed. + */ + if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) { + printk ("CPU (%d) is CPEI Target\n", cpu); + if (can_cpei_retarget()) { + /* + * Now re-target the CPEI to a different processor + */ + new_cpei_cpu = any_online_cpu(cpu_online_map); + mask = cpumask_of_cpu(new_cpei_cpu); + set_cpei_target_cpu(new_cpei_cpu); + desc = irq_descp(ia64_cpe_irq); + /* + * Switch for now, immediatly, we need to do fake intr + * as other interrupts, but need to study CPEI behaviour with + * polling before making changes. 
+ */ + if (desc) { + desc->handler->disable(ia64_cpe_irq); + desc->handler->set_affinity(ia64_cpe_irq, mask); + desc->handler->enable(ia64_cpe_irq); + printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu); + } + } + if (!desc) { + printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu); + retval = -EBUSY; + } + } + return retval; +} + /* must be called with cpucontrol mutex held */ int __cpu_disable(void) { @@ -643,8 +736,17 @@ int __cpu_disable(void) /* * dont permit boot processor for now */ - if (cpu == 0) - return -EBUSY; + if (cpu == 0 && !bsp_remove_ok) { + printk ("Your platform does not support removal of BSP\n"); + return (-EBUSY); + } + + cpu_clear(cpu, cpu_online_map); + + if (migrate_platform_irqs(cpu)) { + cpu_set(cpu, cpu_online_map); + return (-EBUSY); + } remove_siblinginfo(cpu); cpu_clear(cpu, cpu_online_map); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 307d01e15b2..ac167436e93 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -32,7 +32,7 @@ extern unsigned long wall_jiffies; -#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ +volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -71,7 +71,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == TIME_KEEPER_ID) { + if (smp_processor_id() == time_keeper_id) { /* * Here we are in the timer irq handler. We have irqs locally * disabled, but we don't know if the timer_bh is running on @@ -236,6 +236,11 @@ static struct irqaction timer_irqaction = { .name = "timer" }; +void __devinit ia64_disable_timer(void) +{ + ia64_set_itv(1 << 16); +} + void __init time_init (void) { diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 6e5eea19fa6..3b6fd798c4d 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -36,7 +36,7 @@ int arch_register_cpu(int num) parent = &sysfs_nodes[cpu_to_node(num)]; #endif /* CONFIG_NUMA */ -#ifdef CONFIG_ACPI +#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) /* * If CPEI cannot be re-targetted, and this is * CPEI target, then dont create the control file diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index acaaec4e468..9855ba31809 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -181,13 +181,15 @@ per_cpu_init (void) { void *cpu_data; int cpu; + static int first_time=1; /* * get_free_pages() cannot be used before cpu_init() done. BSP * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls * get_zeroed_page(). 
*/ - if (smp_processor_id() == 0) { + if (first_time) { + first_time=0; cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c87d6d1d581..573d5cc63e2 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -528,12 +528,17 @@ void __init find_memory(void) void *per_cpu_init(void) { int cpu; + static int first_time = 1; + if (smp_processor_id() != 0) return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; - for (cpu = 0; cpu < NR_CPUS; cpu++) - per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + if (first_time) { + first_time = 0; + for (cpu = 0; cpu < NR_CPUS; cpu++) + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; } diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 2d13889d0a9..9dbc7dadd16 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* - * This function checks for proper alignment of input addr and len parameters. + * Don't actually need to do any preparation, but need to make sure + * the address is in the right region. */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b38b6d213c1..08d94e6bfa1 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -197,7 +197,7 @@ free_initmem (void) eaddr = (unsigned long) ia64_imva(__init_end); while (addr < eaddr) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); ++totalram_pages; addr += PAGE_SIZE; @@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end) continue; page = virt_to_page(start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(start); ++totalram_pages; } @@ -640,7 +640,7 @@ mem_init (void) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile index 3e9b4eea741..ab9c48c8801 100644 --- a/arch/ia64/sn/kernel/Makefile +++ b/arch/ia64/sn/kernel/Makefile @@ -10,7 +10,8 @@ CPPFLAGS += -I$(srctree)/arch/ia64/sn/include obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \ - huberror.o io_init.o iomv.o klconflib.o sn2/ + huberror.o io_init.o iomv.o klconflib.o pio_phys.o \ + sn2/ obj-$(CONFIG_IA64_GENERIC) += machvec.o obj-$(CONFIG_SGI_TIOCX) += tiocx.o obj-$(CONFIG_IA64_SGI_SN_XP) += xp.o diff --git a/arch/ia64/sn/kernel/pio_phys.S b/arch/ia64/sn/kernel/pio_phys.S new file mode 100644 index 00000000000..3c7d48d6ecb --- /dev/null +++ b/arch/ia64/sn/kernel/pio_phys.S @@ -0,0 +1,71 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved. + * + * This file contains macros used to access MMR registers via + * uncached physical addresses. 
+ * pio_phys_read_mmr - read an MMR + * pio_phys_write_mmr - write an MMR + * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 + * Second MMR will be skipped if address is NULL + * + * Addresses passed to these routines should be uncached physical addresses + * ie., 0x80000.... + */ + + + +#include <asm/asmmacro.h> +#include <asm/page.h> + +GLOBAL_ENTRY(pio_phys_read_mmr) + .prologue + .regstk 1,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + ld8.acq r8=[r32] + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_read_mmr) + +GLOBAL_ENTRY(pio_phys_write_mmr) + .prologue + .regstk 2,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + st8.rel [r32]=r33 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_write_mmr) + +GLOBAL_ENTRY(pio_atomic_phys_write_mmrs) + .prologue + .regstk 4,0,0,0 + .body + mov r2=psr + cmp.ne p9,p0=r34,r0; + rsm psr.i | psr.dt | psr.ic + ;; + srlz.d + st8.rel [r32]=r33 +(p9) st8.rel [r34]=r35 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_atomic_phys_write_mmrs) + + diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 5b84836c217..8b6d5c84470 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 1999,2001-2006 Silicon Graphics, Inc. All rights reserved. */ #include <linux/config.h> @@ -498,6 +498,7 @@ void __init sn_setup(char **cmdline_p) * for sn. */ pm_power_off = ia64_sn_power_down; + current->thread.flags |= IA64_THREAD_MIGRATION; } /** @@ -660,7 +661,8 @@ void __init sn_cpu_init(void) SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3}; u64 *pio; pio = is_shub1() ? pio1 : pio2; - pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]); + pda->pio_write_status_addr = + (volatile unsigned long *)GLOBAL_MMR_ADDR(nasid, pio[slice]); pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0; } diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b2e1e746b47..d9d306c79f2 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -93,6 +93,27 @@ static inline unsigned long wait_piowc(void) return (ws & SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK) != 0; } +/** + * sn_migrate - SN-specific task migration actions + * @task: Task being migrated to new CPU + * + * SN2 PIO writes from separate CPUs are not guaranteed to arrive in order. + * Context switching user threads which have memory-mapped MMIO may cause + * PIOs to issue from seperate CPUs, thus the PIO writes must be drained + * from the previous CPU's Shub before execution resumes on the new CPU. 
+ */ +void sn_migrate(struct task_struct *task) +{ + pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu); + volatile unsigned long *adr = last_pda->pio_write_status_addr; + unsigned long val = last_pda->pio_write_status_val; + + /* Drain PIO writes from old CPU's Shub */ + while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK) + != val)) + cpu_relax(); +} + void sn_tlb_migrate_finish(struct mm_struct *mm) { /* flush_tlb_mm is inefficient if more than 1 users of mm */ diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c index cdf6856ce08..d0abddd9ffe 100644 --- a/arch/ia64/sn/kernel/xpc_channel.c +++ b/arch/ia64/sn/kernel/xpc_channel.c @@ -21,7 +21,6 @@ #include <linux/sched.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/mutex.h> #include <linux/completion.h> #include <asm/sn/bte.h> @@ -30,6 +29,31 @@ /* + * Guarantee that the kzalloc'd memory is cacheline aligned. + */ +static void * +xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kzalloc will give us cachline aligned memory by default */ + *base = kzalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kzalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Set up the initial values for the XPartition Communication channels. */ static void @@ -93,20 +117,19 @@ xpc_setup_infrastructure(struct xpc_partition *part) * Allocate all of the channel structures as a contiguous chunk of * memory. */ - part->channels = kmalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, + part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, GFP_KERNEL); if (part->channels == NULL) { dev_err(xpc_chan, "can't get memory for channels\n"); return xpcNoMemory; } - memset(part->channels, 0, sizeof(struct xpc_channel) * XPC_NCHANNELS); part->nchannels = XPC_NCHANNELS; /* allocate all the required GET/PUT values */ - part->local_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->local_GPs_base); if (part->local_GPs == NULL) { kfree(part->channels); @@ -115,55 +138,51 @@ xpc_setup_infrastructure(struct xpc_partition *part) "values\n"); return xpcNoMemory; } - memset(part->local_GPs, 0, XPC_GP_SIZE); - part->remote_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->remote_GPs_base); if (part->remote_GPs == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; dev_err(xpc_chan, "can't get memory for remote get/put " "values\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_GPs, 0, XPC_GP_SIZE); /* allocate all the required open and close args */ - part->local_openclose_args = xpc_kmalloc_cacheline_aligned( + part->local_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->local_openclose_args_base); if (part->local_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; + dev_err(xpc_chan, "can't get memory for local connect args\n"); 
kfree(part->remote_GPs_base); part->remote_GPs = NULL; - dev_err(xpc_chan, "can't get memory for local connect args\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->local_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); - part->remote_openclose_args = xpc_kmalloc_cacheline_aligned( + part->remote_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->remote_openclose_args_base); if (part->remote_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; + dev_err(xpc_chan, "can't get memory for remote connect args\n"); kfree(part->local_openclose_args_base); part->local_openclose_args = NULL; - dev_err(xpc_chan, "can't get memory for remote connect args\n"); + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); xpc_initialize_channels(part, partid); @@ -186,18 +205,18 @@ xpc_setup_infrastructure(struct xpc_partition *part) ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ, part->IPI_owner, (void *) (u64) partid); if (ret != 0) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; - kfree(part->local_openclose_args_base); - part->local_openclose_args = NULL; - kfree(part->remote_openclose_args_base); - part->remote_openclose_args = NULL; dev_err(xpc_chan, "can't register NOTIFY IRQ handler, " "errno=%d\n", -ret); + kfree(part->remote_openclose_args_base); + part->remote_openclose_args = NULL; + kfree(part->local_openclose_args_base); + part->local_openclose_args = NULL; + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcLackOfResources; } @@ -446,22 +465,20 @@ xpc_allocate_local_msgqueue(struct xpc_channel *ch) for (nentries = ch->local_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->local_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->local_msgqueue_base); if (ch->local_msgqueue == NULL) { continue; } - memset(ch->local_msgqueue, 0, nbytes); nbytes = nentries * sizeof(struct xpc_notify); - ch->notify_queue = kmalloc(nbytes, GFP_KERNEL); + ch->notify_queue = kzalloc(nbytes, GFP_KERNEL); if (ch->notify_queue == NULL) { kfree(ch->local_msgqueue_base); ch->local_msgqueue = NULL; continue; } - memset(ch->notify_queue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->local_nentries) { @@ -501,13 +518,12 @@ xpc_allocate_remote_msgqueue(struct xpc_channel *ch) for (nentries = ch->remote_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->remote_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->remote_msgqueue_base); if (ch->remote_msgqueue == NULL) { continue; } - memset(ch->remote_msgqueue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->remote_nentries) { diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c 
index 8cbf1643257..99b123a6421 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -52,7 +52,6 @@ #include <linux/syscalls.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/completion.h> diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 88a730e6cfd..94211429fd0 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -81,6 +81,31 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE + /* + * Guarantee that the kmalloc'd memory is cacheline aligned. + */ +static void * +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kmalloc will give us cachline aligned memory by default */ + *base = kmalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kmalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Given a nasid, get the physical address of the partition's reserved page * for that nasid. This function returns 0 on any error. */ @@ -1038,13 +1063,12 @@ xpc_discovery(void) remote_vars = (struct xpc_vars *) remote_rp; - discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words, + discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL); if (discovered_nasids == NULL) { kfree(remote_rp_base); return; } - memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words); rp = (struct xpc_rsvd_page *) xpc_rsvd_page; diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index e52831ed93e..fa073cc4b56 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -15,6 +15,124 @@ #include <asm/sn/pcidev.h> #include <asm/sn/pcibus_provider_defs.h> #include <asm/sn/tioce_provider.h> +#include <asm/sn/sn2/sn_hwperf.h> + +/* + * 1/26/2006 + * + * WAR for SGI PV 944642. For revA TIOCE, need to use the following recipe + * (taken from the above PV) before and after accessing tioce internal MMR's + * to avoid tioce lockups. + * + * The recipe as taken from the PV: + * + * if(mmr address < 0x45000) { + * if(mmr address == 0 or 0x80) + * mmr wrt or read address 0xc0 + * else if(mmr address == 0x148 or 0x200) + * mmr wrt or read address 0x28 + * else + * mmr wrt or read address 0x158 + * + * do desired mmr access (rd or wrt) + * + * if(mmr address == 0x100) + * mmr wrt or read address 0x38 + * mmr wrt or read address 0xb050 + * } else + * do desired mmr access + * + * According to hw, we can use reads instead of writes to the above addres + * + * Note this WAR can only to be used for accessing internal MMR's in the + * TIOCE Coretalk Address Range 0x0 - 0x07ff_ffff. This includes the + * "Local CE Registers and Memories" and "PCI Compatible Config Space" address + * spaces from table 2-1 of the "CE Programmer's Reference Overview" document. + * + * All registers defined in struct tioce will meet that criteria. 
+ */ + +static void inline +tioce_mmr_war_pre(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + u64 mmr_war_offset; + + if (mmr_offset == 0 || mmr_offset == 0x80) + mmr_war_offset = 0xc0; + else if (mmr_offset == 0x148 || mmr_offset == 0x200) + mmr_war_offset = 0x28; + else + mmr_war_offset = 0x158; + + readq_relaxed((void *)(mmr_base + mmr_war_offset)); + } +} + +static void inline +tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + if (mmr_offset == 0x100) + readq_relaxed((void *)(mmr_base + 0x38)); + readq_relaxed((void *)(mmr_base + 0xb050)); + } +} + +/* load mmr contents into a variable */ +#define tioce_mmr_load(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + *(varp) = readq_relaxed(mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store variable contents into mmr */ +#define tioce_mmr_store(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(*varp, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store immediate value into mmr */ +#define tioce_mmr_storei(kern, mmrp, val) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(val, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* set bits (immediate value) into mmr */ +#define tioce_mmr_seti(kern, mmrp, bits) do {\ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp |= (bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) + +/* clear bits (immediate value) into mmr */ +#define tioce_mmr_clri(kern, mmrp, bits) do { \ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp &= ~(bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) /** * Bus address ranges for the 5 flavors of TIOCE DMA @@ -62,9 +180,9 @@ #define TIOCE_ATE_M40 2 #define TIOCE_ATE_M40S 3 -#define KB(x) ((x) << 10) -#define MB(x) ((x) << 20) -#define GB(x) ((x) << 30) +#define KB(x) ((u64)(x) << 10) +#define MB(x) ((u64)(x) << 20) +#define GB(x) ((u64)(x) << 30) /** * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode @@ -151,7 +269,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, int last; int entries; int nates; - int pagesize; + u64 pagesize; u64 *ate_shadow; u64 *ate_reg; u64 addr; @@ -228,7 +346,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate = ATE_MAKE(addr, pagesize); ate_shadow[i + j] = ate; - writeq(ate, &ate_reg[i + j]); + tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate); addr += pagesize; } @@ -272,7 +390,8 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr) u64 tmp; ce_kern->ce_port[port].dirmap_shadow = ct_upper; - writeq(ct_upper, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + ct_upper); tmp = ce_mmr->ce_ure_dir_map[port]; dma_ok = 1; } else @@ -344,7 +463,8 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) if (TIOCE_D32_ADDR(bus_addr)) { if (--ce_kern->ce_port[port].dirmap_refcnt == 0) { ce_kern->ce_port[port].dirmap_shadow = 0; - writeq(0, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + 0); } } else { struct tioce_dmamap *map; @@ -365,7 +485,7 @@ 
tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) } else if (--map->refcnt == 0) { for (i = 0; i < map->ate_count; i++) { map->ate_shadow[i] = 0; - map->ate_hw[i] = 0; + tioce_mmr_storei(ce_kern, &map->ate_hw[i], 0); } list_del(&map->ce_dmamap_list); @@ -486,7 +606,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, spin_unlock_irqrestore(&ce_kern->ce_lock, flags); dma_map_done: - if (mapaddr & barrier) + if (mapaddr && barrier) mapaddr = tioce_dma_barrier(mapaddr, 1); return mapaddr; @@ -541,17 +661,61 @@ tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt) soft->ce_pcibus.bs_persist_segment, soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0); + if (ret_stuff.v0) + panic("tioce_error_intr_handler: Fatal TIOCE error"); + return IRQ_HANDLED; } /** + * tioce_reserve_m32 - reserve M32 ate's for the indicated address range + * @tioce_kernel: TIOCE context to reserve ate's for + * @base: starting bus address to reserve + * @limit: last bus address to reserve + * + * If base/limit falls within the range of bus space mapped through the + * M32 space, reserve the resources corresponding to the range. + */ +static void +tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit) +{ + int ate_index, last_ate, ps; + struct tioce *ce_mmr; + + if (!TIOCE_M32_ADDR(base)) + return; + + ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base; + ps = ce_kern->ce_ate3240_pagesize; + ate_index = ATE_PAGE(base, ps); + last_ate = ate_index + ATE_NPAGES(base, limit-base+1, ps) - 1; + + if (ate_index < 64) + ate_index = 64; + + while (ate_index <= last_ate) { + u64 ate; + + ate = ATE_MAKE(0xdeadbeef, ps); + ce_kern->ce_ate3240_shadow[ate_index] = ate; + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index], + ate); + ate_index++; + } +} + +/** * tioce_kern_init - init kernel structures related to a given TIOCE * @tioce_common: ptr to a cached tioce_common struct that originated in prom - */ static struct tioce_kernel * + */ +static struct tioce_kernel * tioce_kern_init(struct tioce_common *tioce_common) { int i; + int ps; + int dev; u32 tmp; + unsigned int seg, bus; struct tioce *tioce_mmr; struct tioce_kernel *tioce_kern; @@ -572,9 +736,10 @@ tioce_kern_init(struct tioce_common *tioce_common) * here to use pci_read_config_xxx() so use the raw_pci_ops vector. 
*/ - raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment, - tioce_common->ce_pcibus.bs_persist_busnum, - PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp); + seg = tioce_common->ce_pcibus.bs_persist_segment; + bus = tioce_common->ce_pcibus.bs_persist_busnum; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1,&tmp); tioce_kern->ce_port1_secondary = (u8) tmp; /* @@ -583,18 +748,76 @@ tioce_kern_init(struct tioce_common *tioce_common) */ tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; - __sn_clrq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_PAGESIZE_MASK); - __sn_setq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_256K_PAGESIZE); - tioce_kern->ce_ate3240_pagesize = KB(256); + tioce_mmr_clri(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_PAGESIZE_MASK); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_256K_PAGESIZE); + ps = tioce_kern->ce_ate3240_pagesize = KB(256); for (i = 0; i < TIOCE_NUM_M40_ATES; i++) { tioce_kern->ce_ate40_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate40[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate40[i], 0); } for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) { tioce_kern->ce_ate3240_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate3240[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate3240[i], 0); + } + + /* + * Reserve ATE's corresponding to reserved address ranges. These + * include: + * + * Memory space covered by each PPB mem base/limit register + * Memory space covered by each PPB prefetch base/limit register + * + * These bus ranges are for pio (downstream) traffic only, and so + * cannot be used for DMA. + */ + + for (dev = 1; dev <= 2; dev++) { + u64 base, limit; + + /* mem base/limit */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_BASE, 2, &tmp); + base = (u64)tmp << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_LIMIT, 2, &tmp); + limit = (u64)tmp << 16; + limit |= 0xfffffUL; + + if (base < limit) + tioce_reserve_m32(tioce_kern, base, limit); + + /* + * prefetch mem base/limit. The tioce ppb's have 64-bit + * decoders, so read the upper portions w/o checking the + * attributes. 
+ */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_BASE, 2, &tmp); + base = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_BASE_UPPER32, 4, &tmp); + base |= (u64)tmp << 32; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_LIMIT, 2, &tmp); + + limit = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + limit |= 0xfffffUL; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_LIMIT_UPPER32, 4, &tmp); + limit |= (u64)tmp << 32; + + if ((base < limit) && TIOCE_M32_ADDR(base)) + tioce_reserve_m32(tioce_kern, base, limit); } return tioce_kern; @@ -614,6 +837,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; u64 force_int_val; @@ -629,6 +853,29 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; + + /* + * TIOCE Rev A workaround (PV 945826), force an interrupt by writing + * the TIO_INTx register directly (1/26/2006) + */ + if (ce_common->ce_rev == TIOCE_REV_A) { + u64 int_bit_mask = (1ULL << sn_irq_info->irq_int_bit); + u64 status; + + tioce_mmr_load(ce_kern, &ce_mmr->ce_adm_int_status, &status); + if (status & int_bit_mask) { + u64 force_irq = (1 << 8) | sn_irq_info->irq_irq; + u64 ctalk = sn_irq_info->irq_xtalkaddr; + u64 nasid, offset; + + nasid = (ctalk & CTALK_NASID_MASK) >> CTALK_NASID_SHFT; + offset = (ctalk & CTALK_NODE_OFFSET); + HUB_S(TIO_IOSPACE_ADDR(nasid, offset), force_irq); + } + + return; + } /* * irq_int_bit is originally set up by prom, and holds the interrupt @@ -666,7 +913,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) default: return; } - writeq(force_int_val, &ce_mmr->ce_adm_force_int); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_force_int, force_int_val); } /** @@ -685,6 +932,7 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; int bit; u64 vector; @@ -695,14 +943,15 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; bit = sn_irq_info->irq_int_bit; - __sn_setq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT; vector |= sn_irq_info->irq_xtalkaddr; - writeq(vector, &ce_mmr->ce_adm_int_dest[bit]); - __sn_clrq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector); + tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); tioce_force_interrupt(sn_irq_info); } @@ -721,7 +970,11 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) static void * tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller) { + int my_nasid; + cnodeid_t my_cnode, mem_cnode; struct tioce_common *tioce_common; + struct tioce_kernel *tioce_kern; + struct tioce *tioce_mmr; /* * Allocate kernel bus soft and copy from prom. 
@@ -734,11 +987,23 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common)); tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET; - if (tioce_kern_init(tioce_common) == NULL) { + tioce_kern = tioce_kern_init(tioce_common); + if (tioce_kern == NULL) { kfree(tioce_common); return NULL; } + /* + * Clear out any transient errors before registering the error + * interrupt handler. + */ + + tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias, + ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL); + if (request_irq(SGI_PCIASIC_ERROR, tioce_error_intr_handler, SA_SHIRQ, "TIOCE error", (void *)tioce_common)) @@ -750,6 +1015,21 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont tioce_common->ce_pcibus.bs_persist_segment, tioce_common->ce_pcibus.bs_persist_busnum); + /* + * identify closest nasid for memory allocations + */ + + my_nasid = NASID_GET(tioce_common->ce_pcibus.bs_base); + my_cnode = nasid_to_cnodeid(my_nasid); + + if (sn_hwperf_get_nearest_node(my_cnode, &mem_cnode, NULL) < 0) { + printk(KERN_WARNING "tioce_bus_fixup: failed to find " + "closest node with MEM to TIO node %d\n", my_cnode); + mem_cnode = (cnodeid_t)-1; /* use any node */ + } + + controller->node = mem_cnode; + return tioce_common; } diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 6facf15b04f..c9e7dad860b 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -226,7 +226,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index c45beb95594..a190e39c907 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index 559942ce0e1..d6d582a5abb 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable) /* unreserve the page so it's possible to free that page */ PD_PAGE(dp)->flags &= ~(1 << PG_reserved); - set_page_count(PD_PAGE(dp), 1); + init_page_count(PD_PAGE(dp)); return; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d855fec2631..afb57eeafdc 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -276,7 +276,7 @@ void free_initmem(void) addr = (unsigned long)&__init_begin; for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { virt_to_page(addr)->flags &= ~(1 << PG_reserved); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); 
free_page(addr); totalram_pages++; } diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index eddb8d3e130..d844c755945 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 89f0b554ffb..d79503fe6e4 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -218,7 +218,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c index 958d2eb7886..8a9ef58cc39 100644 --- a/arch/mips/arc/memory.c +++ b/arch/mips/arc/memory.c @@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 81cb5a76cfb..1edaf3074ee 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void) addr = PAGE_SIZE; while (addr < end) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; } diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c index 2c8afd77a20..ee5e70c95cf 100644 --- a/arch/mips/mips-boards/generic/memory.c +++ b/arch/mips/mips-boards/generic/memory.c @@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c index 0dbd7435bb2..1ec4e75656b 100644 --- a/arch/mips/mips-boards/sim/sim_mem.c +++ b/arch/mips/mips-boards/sim/sim_mem.c @@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 0ff9a348b84..52f7d59fe61 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -54,7 +54,8 @@ unsigned 
long empty_zero_page, zero_page_mask; */ unsigned long setup_zero_pages(void) { - unsigned long order, size; + unsigned int order; + unsigned long size; struct page *page; if (cpu_has_vce) @@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void) panic("Oh boy, that early out of memory?"); page = virt_to_page(empty_zero_page); + split_page(page, order); while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) { SetPageReserved(page); - set_page_count(page, 1); page++; } @@ -244,7 +245,7 @@ void __init mem_init(void) #ifdef CONFIG_LIMITED_DMA set_page_address(page, lowmem_page_address(page)); #endif - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -291,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -314,7 +315,7 @@ void free_initmem(void) page = addr; #endif ClearPageReserved(virt_to_page(page)); - set_page_count(virt_to_page(page), 1); + init_page_count(virt_to_page(page)); free_page(page); totalram_pages++; freed += PAGE_SIZE; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index ed93a979295..e0d095daa5e 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -559,7 +559,7 @@ void __init mem_init(void) /* if (!page_is_ram(pgnr)) continue; */ /* commented out until page_is_ram works */ ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7847ca13d6c..852eda3953d 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -398,7 +398,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); num_physpages++; totalram_pages++; @@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); num_physpages++; totalram_pages++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b51bb28c054..7370f9f33e2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return __pte(old); } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! 
(within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - struct slb_flush_info { struct mm_struct *mm; u16 newareas; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 7d0d75c1184..b57fb3a2b7b 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 81cfb0c2ec5..bacb71c8981 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -140,7 +140,7 @@ void free_initmem(void) for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { memset((void *)addr, 0xcc, PAGE_SIZE); ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 550517c2dd4..454cac01d8c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -108,8 +108,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 0); - free_cold_page(page); + init_page_count(page); + __free_page(page); totalram_pages++; num_physpages++; } @@ -376,7 +376,7 @@ void __init mem_init(void) struct page *page = pfn_to_page(pfn); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index b33a4443f5a..fec8e65b36e 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); set_page_links(page, ZONE_DMA, node_id, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 685fd0defe2..61465ec88bc 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); struct page *end = page + (1 << order); + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); 
SetPageReserved(page); set_pte_at(&init_mm, vaddr, pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); @@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) * Free the otherwise unused pages. */ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 134db5c0420..cb1c294fb93 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -441,7 +441,7 @@ void __init mem_init(void) struct page *page = mem_map + pfn; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index df953383724..a055894f3bd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,7 +292,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index df3a9e452cc..ee73e30263a 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) page = alloc_pages(gfp, order); if (!page) return NULL; + split_page(page, order); ret = page_address(page); *handle = virt_to_phys(ret); @@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) end = page + (1 << order); while (++page < end) { - set_page_count(page, 1); - /* Free any unused pages */ if (page >= free) { __free_page(page); diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6b7a7688c98..a3568fd5150 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. 
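The setup_zero_pages(), __dma_alloc_coherent() and consistent_alloc() hunks above (and the xtensa pte allocation hunks further on) drop the per-page set_page_count() loop in favour of split_page(page, order), which turns one order-N allocation into 2^N independently counted order-0 pages so that unused tail pages can simply be freed. A sketch of that pattern is below; the alloc_exact() helper name is hypothetical and not part of the patch.

    /*
     * Illustrative sketch only: allocate a high-order block, split it into
     * order-0 pages, and return the tail pages that are not needed.
     */
    static void *alloc_exact(size_t size, gfp_t gfp)
    {
            unsigned int order = get_order(size);
            struct page *page = alloc_pages(gfp, order);
            struct page *p, *end;

            if (!page)
                    return NULL;

            /* After split_page(), every constituent page has its own count of 1. */
            split_page(page, order);

            end = page + (1UL << order);
            for (p = page + (PAGE_ALIGN(size) >> PAGE_SHIFT); p < end; p++)
                    __free_page(p);            /* give back the unused pages */

            return page_address(page);
    }
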
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e342565f75f..77b4a838fe1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -273,7 +273,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee..3d89f2a6c78 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c index a65e8bb2c3c..1169757fb38 100644 --- a/arch/sh64/mm/init.c +++ b/arch/sh64/mm/init.c @@ -173,7 +173,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 40d426cce82..4219dd2ce3a 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void) /* Free unneeded trap tables */ ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a21f27d10e5..fbbd8a474c4 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void) /* Free 
unneeded trap tables */ if (!cpu_isset(i, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; } if (!cpu_isset(2, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; } if (!cpu_isset(3, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index c03babaa049..89866973246 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn) struct page *page = pfn_to_page(tmp); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -480,7 +480,7 @@ void free_initmem (void) p = virt_to_page(addr); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; num_physpages++; @@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; } diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index a7a24869d04..280dc7958a1 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. 
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index c2b556106fc..2ae143ba50d 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1461,7 +1461,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; @@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index fa4f915be5c..92cce96b5e2 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start, for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); } } @@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 544665e0451..0e65340eee3 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -279,7 +279,7 @@ int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem) for(i = 0; i < total_pages; i++){ p = &map[i]; - set_page_count(p, 0); + memset(p, 0, sizeof(struct page)); SetPageReserved(p); INIT_LIST_HEAD(&p->lru); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3080f84bf7b..ee5ce3d3cbc 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -477,7 +477,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale; +static unsigned int cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 3496abc8d37..c9dc7e46731 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -124,6 +124,7 @@ extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa95..40ed13d263c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +592,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + 
init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 35f1f1aab06..531ad21447b 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -45,6 +45,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, pte_t *pbase; if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -77,26 +84,12 @@ static inline void flush_map(unsigned long address) on_each_cpu(flush_kernel_map, (void *)address, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; - unsigned long address; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ -static inline void save_page(unsigned long address, struct page *fpage) +static inline void save_page(struct page *fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(address); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df->address = address; - df_list = df; - } + fpage->lru.next = (struct list_head *)deferred_pages; + deferred_pages = fpage; } /* @@ -138,8 +131,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte, pfn_pte(pfn, prot)); } else { /* - * split_large_page will take the reference for this change_page_attr - * on the split page. + * split_large_page will take the reference for this + * change_page_attr on the split page. */ struct page *split; @@ -151,23 +144,20 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte,mk_pte(split, ref_prot2)); kpte_page = split; } - get_page(kpte_page); + page_private(kpte_page)++; } else if ((kpte_flags & _PAGE_PSE) == 0) { set_pte(kpte, pfn_pte(pfn, ref_prot)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - switch (page_count(kpte_page)) { - case 1: - save_page(address, kpte_page); + if (page_private(kpte_page) == 0) { + save_page(kpte_page); revert_page(address, ref_prot); - break; - case 0: - BUG(); /* memleak and failed 2M page regeneration */ } return 0; } @@ -220,17 +210,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + struct page *dpage; down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); + dpage = xchg(&deferred_pages, NULL); up_read(&init_mm.mmap_sem); - flush_map((df && !df->next) ? df->address : 0); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); + + flush_map((dpage && !dpage->lru.next) ? 
(unsigned long)page_address(dpage) : 0); + while (dpage) { + struct page *tmp = dpage; + dpage = (struct page *)dpage->lru.next; + ClearPagePrivate(tmp); + __free_page(tmp); } } diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 5a91d6c9e66..e1be4235f36 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end) { for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page((unsigned long)start); totalram_pages++; } diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c index e5e119c820e..7d28914d11c 100644 --- a/arch/xtensa/mm/pgtable.c +++ b/arch/xtensa/mm/pgtable.c @@ -14,25 +14,21 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte, p; + pte_t *pte = NULL, *p; int color = ADDR_COLOR(address); int i; p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER); if (likely(p)) { - struct page *page; - - for (i = 0; i < COLOR_SIZE; i++, p++) { - page = virt_to_page(pte); - - set_page_count(page, 1); - ClearPageCompound(page); + split_page(virt_to_page(p), COLOR_ORDER); + for (i = 0; i < COLOR_SIZE; i++) { if (ADDR_COLOR(p) == color) pte = p; else free_page(p); + p += PTRS_PER_PTE; } clear_page(pte); } @@ -49,20 +45,20 @@ int flush; struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page, p; + struct page *page = NULL, *p; int color = ADDR_COLOR(address); p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); if (likely(p)) { - for (i = 0; i < PAGE_ORDER; i++) { - set_page_count(p, 1); - ClearPageCompound(p); + split_page(p, COLOR_ORDER); - if (PADDR_COLOR(page_address(pg)) == color) + for (i = 0; i < PAGE_ORDER; i++) { + if (PADDR_COLOR(page_address(p)) == color) page = p; else - free_page(p); + __free_page(p); + p++; } clear_highpage(page); } diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h index a9efc13cc85..8a98169b60c 100644 --- a/drivers/char/snsc.h +++ b/drivers/char/snsc.h @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved. */ /* @@ -70,6 +70,9 @@ struct sysctl_data_s { #define EV_CLASS_TEST_WARNING 0x6000ul #define EV_CLASS_PWRD_NOTIFY 0x8000ul +/* ENV class codes */ +#define ENV_PWRDN_PEND 0x4101ul + #define EV_SEVERITY_POWER_STABLE 0x0000ul #define EV_SEVERITY_POWER_LOW_WARNING 0x0100ul #define EV_SEVERITY_POWER_HIGH_WARNING 0x0200ul diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c index baaa365285f..a4fa507eed9 100644 --- a/drivers/char/snsc_event.c +++ b/drivers/char/snsc_event.c @@ -5,7 +5,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved. */ /* @@ -187,7 +187,8 @@ scdrv_event_severity(int code) static void scdrv_dispatch_event(char *event, int len) { - int code, esp_code, src; + static int snsc_shutting_down = 0; + int code, esp_code, src, class; char desc[CHUNKSIZE]; char *severity; @@ -199,9 +200,25 @@ scdrv_dispatch_event(char *event, int len) /* how urgent is the message? 
*/ severity = scdrv_event_severity(code); - if ((code & EV_CLASS_MASK) == EV_CLASS_PWRD_NOTIFY) { + class = (code & EV_CLASS_MASK); + + if (class == EV_CLASS_PWRD_NOTIFY || code == ENV_PWRDN_PEND) { struct task_struct *p; + if (snsc_shutting_down) + return; + + snsc_shutting_down = 1; + + /* give a message for each type of event */ + if (class == EV_CLASS_PWRD_NOTIFY) + printk(KERN_NOTICE "Power off indication received." + " Sending SIGPWR to init...\n"); + else if (code == ENV_PWRDN_PEND) + printk(KERN_CRIT "WARNING: Shutting down the system" + " due to a critical environmental condition." + " Sending SIGPWR to init...\n"); + /* give a SIGPWR signal to init proc */ /* first find init's task */ @@ -210,12 +227,11 @@ scdrv_dispatch_event(char *event, int len) if (p->pid == 1) break; } - if (p) { /* we found init's task */ - printk(KERN_EMERG "Power off indication received. Initiating power fail sequence...\n"); + if (p) { force_sig(SIGPWR, p); - } else { /* failed to find init's task - just give message(s) */ - printk(KERN_WARNING "Failed to find init proc to handle power off!\n"); - printk("%s|$(0x%x)%s\n", severity, esp_code, desc); + } else { + printk(KERN_ERR "Failed to signal init!\n"); + snsc_shutting_down = 0; /* so can try again (?) */ } read_unlock(&tasklist_lock); } else { diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c index ac2a297ce37..a80c8321087 100644 --- a/drivers/char/tb0219.c +++ b/drivers/char/tb0219.c @@ -283,7 +283,7 @@ static void tb0219_pci_irq_init(void) vr41xx_set_irq_level(TB0219_PCI_SLOT3_PIN, IRQ_LEVEL_LOW); } -static int tb0219_probe(struct platform_device *dev) +static int __devinit tb0219_probe(struct platform_device *dev) { int retval; @@ -319,7 +319,7 @@ static int tb0219_probe(struct platform_device *dev) return 0; } -static int tb0219_remove(struct platform_device *dev) +static int __devexit tb0219_remove(struct platform_device *dev) { _machine_restart = old_machine_restart; @@ -335,19 +335,26 @@ static struct platform_device *tb0219_platform_device; static struct platform_driver tb0219_device_driver = { .probe = tb0219_probe, - .remove = tb0219_remove, + .remove = __devexit_p(tb0219_remove), .driver = { .name = "TB0219", + .owner = THIS_MODULE, }, }; -static int __devinit tanbac_tb0219_init(void) +static int __init tanbac_tb0219_init(void) { int retval; - tb0219_platform_device = platform_device_register_simple("TB0219", -1, NULL, 0); - if (IS_ERR(tb0219_platform_device)) - return PTR_ERR(tb0219_platform_device); + tb0219_platform_device = platform_device_alloc("TB0219", -1); + if (!tb0219_platform_device) + return -ENOMEM; + + retval = platform_device_add(tb0219_platform_device); + if (retval < 0) { + platform_device_put(tb0219_platform_device); + return retval; + } retval = platform_driver_register(&tb0219_device_driver); if (retval < 0) @@ -356,10 +363,9 @@ static int __devinit tanbac_tb0219_init(void) return retval; } -static void __devexit tanbac_tb0219_exit(void) +static void __exit tanbac_tb0219_exit(void) { platform_driver_unregister(&tb0219_device_driver); - platform_device_unregister(tb0219_platform_device); } diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c index 2267c7b8179..05e6e814d86 100644 --- a/drivers/char/vr41xx_giu.c +++ b/drivers/char/vr41xx_giu.c @@ -613,7 +613,7 @@ static struct file_operations gpio_fops = { .release = gpio_release, }; -static int giu_probe(struct platform_device *dev) +static int __devinit giu_probe(struct platform_device *dev) { unsigned long start, size, flags = 0; unsigned int 
nr_pins = 0; @@ -697,7 +697,7 @@ static int giu_probe(struct platform_device *dev) return cascade_irq(GIUINT_IRQ, giu_get_irq); } -static int giu_remove(struct platform_device *dev) +static int __devexit giu_remove(struct platform_device *dev) { iounmap(giu_base); @@ -712,9 +712,10 @@ static struct platform_device *giu_platform_device; static struct platform_driver giu_device_driver = { .probe = giu_probe, - .remove = giu_remove, + .remove = __devexit_p(giu_remove), .driver = { .name = "GIU", + .owner = THIS_MODULE, }, }; @@ -722,9 +723,15 @@ static int __init vr41xx_giu_init(void) { int retval; - giu_platform_device = platform_device_register_simple("GIU", -1, NULL, 0); - if (IS_ERR(giu_platform_device)) - return PTR_ERR(giu_platform_device); + giu_platform_device = platform_device_alloc("GIU", -1); + if (!giu_platform_device) + return -ENOMEM; + + retval = platform_device_add(giu_platform_device); + if (retval < 0) { + platform_device_put(giu_platform_device); + return retval; + } retval = platform_driver_register(&giu_device_driver); if (retval < 0) diff --git a/drivers/char/vr41xx_rtc.c b/drivers/char/vr41xx_rtc.c index bc1b4a15212..b109d9a502d 100644 --- a/drivers/char/vr41xx_rtc.c +++ b/drivers/char/vr41xx_rtc.c @@ -558,7 +558,7 @@ static struct miscdevice rtc_miscdevice = { .fops = &rtc_fops, }; -static int rtc_probe(struct platform_device *pdev) +static int __devinit rtc_probe(struct platform_device *pdev) { unsigned int irq; int retval; @@ -631,7 +631,7 @@ static int rtc_probe(struct platform_device *pdev) return 0; } -static int rtc_remove(struct platform_device *dev) +static int __devexit rtc_remove(struct platform_device *dev) { int retval; @@ -653,13 +653,14 @@ static struct platform_device *rtc_platform_device; static struct platform_driver rtc_device_driver = { .probe = rtc_probe, - .remove = rtc_remove, + .remove = __devexit_p(rtc_remove), .driver = { .name = rtc_name, + .owner = THIS_MODULE, }, }; -static int __devinit vr41xx_rtc_init(void) +static int __init vr41xx_rtc_init(void) { int retval; @@ -684,10 +685,20 @@ static int __devinit vr41xx_rtc_init(void) break; } - rtc_platform_device = platform_device_register_simple("RTC", -1, - rtc_resource, ARRAY_SIZE(rtc_resource)); - if (IS_ERR(rtc_platform_device)) - return PTR_ERR(rtc_platform_device); + rtc_platform_device = platform_device_alloc("RTC", -1); + if (!rtc_platform_device) + return -ENOMEM; + + retval = platform_device_add_resources(rtc_platform_device, + rtc_resource, ARRAY_SIZE(rtc_resource)); + + if (retval == 0) + retval = platform_device_add(rtc_platform_device); + + if (retval < 0) { + platform_device_put(rtc_platform_device); + return retval; + } retval = platform_driver_register(&rtc_device_driver); if (retval < 0) @@ -696,10 +707,9 @@ static int __devinit vr41xx_rtc_init(void) return retval; } -static void __devexit vr41xx_rtc_exit(void) +static void __exit vr41xx_rtc_exit(void) { platform_driver_unregister(&rtc_device_driver); - platform_device_unregister(rtc_platform_device); } diff --git a/drivers/char/watchdog/mv64x60_wdt.c b/drivers/char/watchdog/mv64x60_wdt.c index 00d9ef04a36..f1b9cf89f15 100644 --- a/drivers/char/watchdog/mv64x60_wdt.c +++ b/drivers/char/watchdog/mv64x60_wdt.c @@ -228,15 +228,25 @@ static int __init mv64x60_wdt_init(void) printk(KERN_INFO "MV64x60 watchdog driver\n"); - mv64x60_wdt_dev = platform_device_register_simple(MV64x60_WDT_NAME, - -1, NULL, 0); - if (IS_ERR(mv64x60_wdt_dev)) { - ret = PTR_ERR(mv64x60_wdt_dev); + mv64x60_wdt_dev = platform_device_alloc(MV64x60_WDT_NAME, 
-1); + if (!mv64x60_wdt_dev) { + ret = -ENOMEM; + goto out; + } + + ret = platform_device_add(mv64x60_wdt_dev); + if (ret) { + platform_device_put(mv64x60_wdt_dev); goto out; } ret = platform_driver_register(&mv64x60_wdt_driver); - out: + if (ret) { + platform_device_unregister(mv64x60_wdt_dev); + goto out; + } + + out: return ret; } diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 4652512f7d1..3a4e5c5b4e1 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -530,30 +530,27 @@ static DCDBAS_DEV_ATTR_RW(host_control_action); static DCDBAS_DEV_ATTR_RW(host_control_smi_type); static DCDBAS_DEV_ATTR_RW(host_control_on_shutdown); -static struct device_attribute *dcdbas_dev_attrs[] = { - &dev_attr_smi_data_buf_size, - &dev_attr_smi_data_buf_phys_addr, - &dev_attr_smi_request, - &dev_attr_host_control_action, - &dev_attr_host_control_smi_type, - &dev_attr_host_control_on_shutdown, +static struct attribute *dcdbas_dev_attrs[] = { + &dev_attr_smi_data_buf_size.attr, + &dev_attr_smi_data_buf_phys_addr.attr, + &dev_attr_smi_request.attr, + &dev_attr_host_control_action.attr, + &dev_attr_host_control_smi_type.attr, + &dev_attr_host_control_on_shutdown.attr, NULL }; -/** - * dcdbas_init: initialize driver - */ -static int __init dcdbas_init(void) +static struct attribute_group dcdbas_attr_group = { + .attrs = dcdbas_dev_attrs, +}; + +static int __devinit dcdbas_probe(struct platform_device *dev) { - int i; + int i, error; host_control_action = HC_ACTION_NONE; host_control_smi_type = HC_SMITYPE_NONE; - dcdbas_pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); - if (IS_ERR(dcdbas_pdev)) - return PTR_ERR(dcdbas_pdev); - /* * BIOS SMI calls require buffer addresses be in 32-bit address space. * This is done by setting the DMA mask below. 
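The tb0219, vr41xx_giu, vr41xx_rtc, mv64x60_wdt and dcdbas changes all replace platform_device_register_simple() with the two-step platform_device_alloc()/platform_device_add() sequence so that each failure point can be unwound correctly. A hedged sketch of that sequence, using hypothetical example_* names (assumes <linux/platform_device.h>), is shown here: a device that has been added must be unregistered, while one that was only allocated must be put.

    /* Illustrative sketch only -- not code from the patch. */
    static struct platform_driver example_driver = {
            .driver = {
                    .name  = "example",
                    .owner = THIS_MODULE,
            },
    };

    static struct platform_device *example_pdev;

    static int __init example_init(void)
    {
            int error;

            example_pdev = platform_device_alloc("example", -1);
            if (!example_pdev)
                    return -ENOMEM;

            error = platform_device_add(example_pdev);
            if (error) {
                    platform_device_put(example_pdev);        /* never added: drop the reference */
                    return error;
            }

            error = platform_driver_register(&example_driver);
            if (error)
                    platform_device_unregister(example_pdev); /* was added: full teardown */

            return error;
    }
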
@@ -561,19 +558,79 @@ static int __init dcdbas_init(void) dcdbas_pdev->dev.coherent_dma_mask = DMA_32BIT_MASK; dcdbas_pdev->dev.dma_mask = &dcdbas_pdev->dev.coherent_dma_mask; + error = sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group); + if (error) + return error; + + for (i = 0; dcdbas_bin_attrs[i]; i++) { + error = sysfs_create_bin_file(&dev->dev.kobj, + dcdbas_bin_attrs[i]); + if (error) { + while (--i >= 0) + sysfs_remove_bin_file(&dev->dev.kobj, + dcdbas_bin_attrs[i]); + sysfs_create_group(&dev->dev.kobj, &dcdbas_attr_group); + return error; + } + } + register_reboot_notifier(&dcdbas_reboot_nb); + dev_info(&dev->dev, "%s (version %s)\n", + DRIVER_DESCRIPTION, DRIVER_VERSION); + + return 0; +} + +static int __devexit dcdbas_remove(struct platform_device *dev) +{ + int i; + + unregister_reboot_notifier(&dcdbas_reboot_nb); for (i = 0; dcdbas_bin_attrs[i]; i++) - sysfs_create_bin_file(&dcdbas_pdev->dev.kobj, - dcdbas_bin_attrs[i]); + sysfs_remove_bin_file(&dev->dev.kobj, dcdbas_bin_attrs[i]); + sysfs_remove_group(&dev->dev.kobj, &dcdbas_attr_group); - for (i = 0; dcdbas_dev_attrs[i]; i++) - device_create_file(&dcdbas_pdev->dev, dcdbas_dev_attrs[i]); + return 0; +} - dev_info(&dcdbas_pdev->dev, "%s (version %s)\n", - DRIVER_DESCRIPTION, DRIVER_VERSION); +static struct platform_driver dcdbas_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + }, + .probe = dcdbas_probe, + .remove = __devexit_p(dcdbas_remove), +}; + +/** + * dcdbas_init: initialize driver + */ +static int __init dcdbas_init(void) +{ + int error; + + error = platform_driver_register(&dcdbas_driver); + if (error) + return error; + + dcdbas_pdev = platform_device_alloc(DRIVER_NAME, -1); + if (!dcdbas_pdev) { + error = -ENOMEM; + goto err_unregister_driver; + } + + error = platform_device_add(dcdbas_pdev); + if (error) + goto err_free_device; return 0; + + err_free_device: + platform_device_put(dcdbas_pdev); + err_unregister_driver: + platform_driver_unregister(&dcdbas_driver); + return error; } /** @@ -588,6 +645,15 @@ static void __exit dcdbas_exit(void) unregister_reboot_notifier(&dcdbas_reboot_nb); smi_data_buf_free(); platform_device_unregister(dcdbas_pdev); + platform_driver_unregister(&dcdbas_driver); + + /* + * We have to free the buffer here instead of dcdbas_remove + * because only in module exit function we can be sure that + * all sysfs attributes belonging to this module have been + * released. + */ + smi_data_buf_free(); } module_init(dcdbas_init); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 88d60202b9d..26b08ee425c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -533,30 +533,35 @@ static void __clone_and_map(struct clone_info *ci) } else { /* - * Create two copy bios to deal with io that has - * been split across a target. + * Handle a bvec that must be split between two or more targets. 
*/ struct bio_vec *bv = bio->bi_io_vec + ci->idx; + sector_t remaining = to_sector(bv->bv_len); + unsigned int offset = 0; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset, max); - __map_bio(ti, clone, tio); + do { + if (offset) { + ti = dm_table_find_target(ci->map, ci->sector); + max = max_io_len(ci->md, ci->sector, ti); - ci->sector += max; - ci->sector_count -= max; - ti = dm_table_find_target(ci->map, ci->sector); - - len = to_sector(bv->bv_len) - max; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + to_bytes(max), len); - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); - __map_bio(ti, clone, tio); + tio = alloc_tio(ci->md); + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); + } + + len = min(remaining, max); + + clone = split_bvec(bio, ci->sector, ci->idx, + bv->bv_offset + offset, len); + + __map_bio(ti, clone, tio); + + ci->sector += len; + ci->sector_count -= len; + offset += to_bytes(len); + } while (remaining -= len); - ci->sector += len; - ci->sector_count -= len; ci->idx++; } } diff --git a/drivers/media/dvb/bt8xx/Makefile b/drivers/media/dvb/bt8xx/Makefile index 9d197efb481..d188e4c670b 100644 --- a/drivers/media/dvb/bt8xx/Makefile +++ b/drivers/media/dvb/bt8xx/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_DVB_BT8XX) += bt878.o dvb-bt8xx.o dst.o dst_ca.o -EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video/bt8xx -Idrivers/media/dvb/frontends +EXTRA_CFLAGS = -Idrivers/media/dvb/dvb-core/ -Idrivers/media/video -Idrivers/media/dvb/frontends diff --git a/drivers/net/mv643xx_eth.h b/drivers/net/mv643xx_eth.h index 7754d1974b9..4262c1da6d4 100644 --- a/drivers/net/mv643xx_eth.h +++ b/drivers/net/mv643xx_eth.h @@ -42,13 +42,23 @@ #define MAX_DESCS_PER_SKB 1 #endif +/* + * The MV643XX HW requires 8-byte alignment. However, when I/O + * is non-cache-coherent, we need to ensure that the I/O buffers + * we use don't share cache lines with other data. 
+ */ +#if defined(CONFIG_DMA_NONCOHERENT) || defined(CONFIG_NOT_COHERENT_CACHE) +#define ETH_DMA_ALIGN L1_CACHE_BYTES +#else +#define ETH_DMA_ALIGN 8 +#endif + #define ETH_VLAN_HLEN 4 #define ETH_FCS_LEN 4 -#define ETH_DMA_ALIGN 8 /* hw requires 8-byte alignment */ -#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */ +#define ETH_HW_IP_ALIGN 2 /* hw aligns IP header */ #define ETH_WRAPPER_LEN (ETH_HW_IP_ALIGN + ETH_HLEN + \ - ETH_VLAN_HLEN + ETH_FCS_LEN) -#define ETH_RX_SKB_SIZE ((dev->mtu + ETH_WRAPPER_LEN + 7) & ~0x7) + ETH_VLAN_HLEN + ETH_FCS_LEN) +#define ETH_RX_SKB_SIZE (dev->mtu + ETH_WRAPPER_LEN + ETH_DMA_ALIGN) #define ETH_RX_QUEUES_ENABLED (1 << 0) /* use only Q0 for receive */ #define ETH_TX_QUEUES_ENABLED (1 << 0) /* use only Q0 for transmit */ diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 7e900572eaf..9595f74da93 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -22,12 +22,12 @@ *************************************************************************/ #define DRV_NAME "pcnet32" -#define DRV_VERSION "1.31c" -#define DRV_RELDATE "01.Nov.2005" +#define DRV_VERSION "1.32" +#define DRV_RELDATE "18.Mar.2006" #define PFX DRV_NAME ": " -static const char * const version = -DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; +static const char *const version = + DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; #include <linux/module.h> #include <linux/kernel.h> @@ -58,18 +58,23 @@ DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " tsbogend@alpha.franken.de\n"; * PCI device identifiers for "new style" Linux PCI Device Drivers */ static struct pci_device_id pcnet32_pci_tbl[] = { - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, - /* - * Adapters that were sold with IBM's RS/6000 or pSeries hardware have - * the incorrect vendor id. - */ - { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0 }, - { 0, } + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE_HOME, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LANCE, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + + /* + * Adapters that were sold with IBM's RS/6000 or pSeries hardware have + * the incorrect vendor id. + */ + { PCI_VENDOR_ID_TRIDENT, PCI_DEVICE_ID_AMD_LANCE, + PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_NETWORK_ETHERNET << 8, 0xffff00, 0}, + + { } /* terminate list */ }; -MODULE_DEVICE_TABLE (pci, pcnet32_pci_tbl); +MODULE_DEVICE_TABLE(pci, pcnet32_pci_tbl); static int cards_found; @@ -77,13 +82,11 @@ static int cards_found; * VLB I/O addresses */ static unsigned int pcnet32_portlist[] __initdata = - { 0x300, 0x320, 0x340, 0x360, 0 }; - - + { 0x300, 0x320, 0x340, 0x360, 0 }; static int pcnet32_debug = 0; -static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */ -static int pcnet32vlb; /* check for VLB cards ? */ +static int tx_start = 1; /* Mapping -- 0:20, 1:64, 2:128, 3:~220 (depends on chip vers) */ +static int pcnet32vlb; /* check for VLB cards ? 
*/ static struct net_device *pcnet32_dev; @@ -110,32 +113,34 @@ static int rx_copybreak = 200; * to internal options */ static const unsigned char options_mapping[] = { - PCNET32_PORT_ASEL, /* 0 Auto-select */ - PCNET32_PORT_AUI, /* 1 BNC/AUI */ - PCNET32_PORT_AUI, /* 2 AUI/BNC */ - PCNET32_PORT_ASEL, /* 3 not supported */ - PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */ - PCNET32_PORT_ASEL, /* 5 not supported */ - PCNET32_PORT_ASEL, /* 6 not supported */ - PCNET32_PORT_ASEL, /* 7 not supported */ - PCNET32_PORT_ASEL, /* 8 not supported */ - PCNET32_PORT_MII, /* 9 MII 10baseT */ - PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */ - PCNET32_PORT_MII, /* 11 MII (autosel) */ - PCNET32_PORT_10BT, /* 12 10BaseT */ - PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */ - PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD, /* 14 MII 100BaseTx-FD */ - PCNET32_PORT_ASEL /* 15 not supported */ + PCNET32_PORT_ASEL, /* 0 Auto-select */ + PCNET32_PORT_AUI, /* 1 BNC/AUI */ + PCNET32_PORT_AUI, /* 2 AUI/BNC */ + PCNET32_PORT_ASEL, /* 3 not supported */ + PCNET32_PORT_10BT | PCNET32_PORT_FD, /* 4 10baseT-FD */ + PCNET32_PORT_ASEL, /* 5 not supported */ + PCNET32_PORT_ASEL, /* 6 not supported */ + PCNET32_PORT_ASEL, /* 7 not supported */ + PCNET32_PORT_ASEL, /* 8 not supported */ + PCNET32_PORT_MII, /* 9 MII 10baseT */ + PCNET32_PORT_MII | PCNET32_PORT_FD, /* 10 MII 10baseT-FD */ + PCNET32_PORT_MII, /* 11 MII (autosel) */ + PCNET32_PORT_10BT, /* 12 10BaseT */ + PCNET32_PORT_MII | PCNET32_PORT_100, /* 13 MII 100BaseTx */ + /* 14 MII 100BaseTx-FD */ + PCNET32_PORT_MII | PCNET32_PORT_100 | PCNET32_PORT_FD, + PCNET32_PORT_ASEL /* 15 not supported */ }; static const char pcnet32_gstrings_test[][ETH_GSTRING_LEN] = { - "Loopback test (offline)" + "Loopback test (offline)" }; + #define PCNET32_TEST_LEN (sizeof(pcnet32_gstrings_test) / ETH_GSTRING_LEN) -#define PCNET32_NUM_REGS 168 +#define PCNET32_NUM_REGS 136 -#define MAX_UNITS 8 /* More are supported, limit only on options */ +#define MAX_UNITS 8 /* More are supported, limit only on options */ static int options[MAX_UNITS]; static int full_duplex[MAX_UNITS]; static int homepna[MAX_UNITS]; @@ -151,124 +156,6 @@ static int homepna[MAX_UNITS]; */ /* - * History: - * v0.01: Initial version - * only tested on Alpha Noname Board - * v0.02: changed IRQ handling for new interrupt scheme (dev_id) - * tested on a ASUS SP3G - * v0.10: fixed an odd problem with the 79C974 in a Compaq Deskpro XL - * looks like the 974 doesn't like stopping and restarting in a - * short period of time; now we do a reinit of the lance; the - * bug was triggered by doing ifconfig eth0 <ip> broadcast <addr> - * and hangs the machine (thanks to Klaus Liedl for debugging) - * v0.12: by suggestion from Donald Becker: Renamed driver to pcnet32, - * made it standalone (no need for lance.c) - * v0.13: added additional PCI detecting for special PCI devices (Compaq) - * v0.14: stripped down additional PCI probe (thanks to David C Niemi - * and sveneric@xs4all.nl for testing this on their Compaq boxes) - * v0.15: added 79C965 (VLB) probe - * added interrupt sharing for PCI chips - * v0.16: fixed set_multicast_list on Alpha machines - * v0.17: removed hack from dev.c; now pcnet32 uses ethif_probe in Space.c - * v0.19: changed setting of autoselect bit - * v0.20: removed additional Compaq PCI probe; there is now a working one - * in arch/i386/bios32.c - * v0.21: added endian conversion for ppc, from work by cort@cs.nmt.edu - * v0.22: added printing of status to ring dump - * 
v0.23: changed enet_statistics to net_devive_stats - * v0.90: added multicast filter - * added module support - * changed irq probe to new style - * added PCnetFast chip id - * added fix for receive stalls with Intel saturn chipsets - * added in-place rx skbs like in the tulip driver - * minor cleanups - * v0.91: added PCnetFast+ chip id - * back port to 2.0.x - * v1.00: added some stuff from Donald Becker's 2.0.34 version - * added support for byte counters in net_dev_stats - * v1.01: do ring dumps, only when debugging the driver - * increased the transmit timeout - * v1.02: fixed memory leak in pcnet32_init_ring() - * v1.10: workaround for stopped transmitter - * added port selection for modules - * detect special T1/E1 WAN card and setup port selection - * v1.11: fixed wrong checking of Tx errors - * v1.20: added check of return value kmalloc (cpeterso@cs.washington.edu) - * added save original kmalloc addr for freeing (mcr@solidum.com) - * added support for PCnetHome chip (joe@MIT.EDU) - * rewritten PCI card detection - * added dwio mode to get driver working on some PPC machines - * v1.21: added mii selection and mii ioctl - * v1.22: changed pci scanning code to make PPC people happy - * fixed switching to 32bit mode in pcnet32_open() (thanks - * to Michael Richard <mcr@solidum.com> for noticing this one) - * added sub vendor/device id matching (thanks again to - * Michael Richard <mcr@solidum.com>) - * added chip id for 79c973/975 (thanks to Zach Brown <zab@zabbo.net>) - * v1.23 fixed small bug, when manual selecting MII speed/duplex - * v1.24 Applied Thomas' patch to use TxStartPoint and thus decrease TxFIFO - * underflows. Added tx_start_pt module parameter. Increased - * TX_RING_SIZE from 16 to 32. Added #ifdef'd code to use DXSUFLO - * for FAST[+] chipsets. <kaf@fc.hp.com> - * v1.24ac Added SMP spinlocking - Alan Cox <alan@redhat.com> - * v1.25kf Added No Interrupt on successful Tx for some Tx's <kaf@fc.hp.com> - * v1.26 Converted to pci_alloc_consistent, Jamey Hicks / George France - * <jamey@crl.dec.com> - * - Fixed a few bugs, related to running the controller in 32bit mode. - * 23 Oct, 2000. Carsten Langgaard, carstenl@mips.com - * Copyright (C) 2000 MIPS Technologies, Inc. All rights reserved. - * v1.26p Fix oops on rmmod+insmod; plug i/o resource leak - Paul Gortmaker - * v1.27 improved CSR/PROM address detection, lots of cleanups, - * new pcnet32vlb module option, HP-PARISC support, - * added module parameter descriptions, - * initial ethtool support - Helge Deller <deller@gmx.de> - * v1.27a Sun Feb 10 2002 Go Taniguchi <go@turbolinux.co.jp> - * use alloc_etherdev and register_netdev - * fix pci probe not increment cards_found - * FD auto negotiate error workaround for xSeries250 - * clean up and using new mii module - * v1.27b Sep 30 2002 Kent Yoder <yoder1@us.ibm.com> - * Added timer for cable connection state changes. - * v1.28 20 Feb 2004 Don Fry <brazilnut@us.ibm.com> - * Jon Mason <jonmason@us.ibm.com>, Chinmay Albal <albal@in.ibm.com> - * Now uses ethtool_ops, netif_msg_* and generic_mii_ioctl. - * Fixes bogus 'Bus master arbitration failure', pci_[un]map_single - * length errors, and transmit hangs. Cleans up after errors in open. - * Jim Lewis <jklewis@us.ibm.com> added ethernet loopback test. - * Thomas Munck Steenholdt <tmus@tmus.dk> non-mii ioctl corrections. - * v1.29 6 Apr 2004 Jim Lewis <jklewis@us.ibm.com> added physical - * identification code (blink led's) and register dump. 
- * Don Fry added timer for 971/972 so skbufs don't remain on tx ring - * forever. - * v1.30 18 May 2004 Don Fry removed timer and Last Transmit Interrupt - * (ltint) as they added complexity and didn't give good throughput. - * v1.30a 22 May 2004 Don Fry limit frames received during interrupt. - * v1.30b 24 May 2004 Don Fry fix bogus tx carrier errors with 79c973, - * assisted by Bruce Penrod <bmpenrod@endruntechnologies.com>. - * v1.30c 25 May 2004 Don Fry added netif_wake_queue after pcnet32_restart. - * v1.30d 01 Jun 2004 Don Fry discard oversize rx packets. - * v1.30e 11 Jun 2004 Don Fry recover after fifo error and rx hang. - * v1.30f 16 Jun 2004 Don Fry cleanup IRQ to allow 0 and 1 for PCI, - * expanding on suggestions from Ralf Baechle <ralf@linux-mips.org>, - * and Brian Murphy <brian@murphy.dk>. - * v1.30g 22 Jun 2004 Patrick Simmons <psimmons@flash.net> added option - * homepna for selecting HomePNA mode for PCNet/Home 79C978. - * v1.30h 24 Jun 2004 Don Fry correctly select auto, speed, duplex in bcr32. - * v1.30i 28 Jun 2004 Don Fry change to use module_param. - * v1.30j 29 Apr 2005 Don Fry fix skb/map leak with loopback test. - * v1.31 02 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> added set_ringparam(). - * v1.31a 12 Sep 2005 Hubert WS Lin <wslin@tw.ibm.c0m> set min ring size to 4 - * to allow loopback test to work unchanged. - * v1.31b 06 Oct 2005 Don Fry changed alloc_ring to show name of device - * if allocation fails - * v1.31c 01 Nov 2005 Don Fry Allied Telesyn 2700/2701 FX are 100Mbit only. - * Force 100Mbit FD if Auto (ASEL) is selected. - * See Bugzilla 2669 and 4551. - */ - - -/* * Set the number of Tx and Rx buffers, using Log_2(# buffers). * Reasonable default values are 4 Tx buffers, and 16 Rx buffers. * That translates to 2 (4 == 2^^2) and 4 (16 == 2^^4). @@ -303,42 +190,42 @@ static int homepna[MAX_UNITS]; /* The PCNET32 Rx and Tx ring descriptors. */ struct pcnet32_rx_head { - u32 base; - s16 buf_length; - s16 status; - u32 msg_length; - u32 reserved; + u32 base; + s16 buf_length; + s16 status; + u32 msg_length; + u32 reserved; }; struct pcnet32_tx_head { - u32 base; - s16 length; - s16 status; - u32 misc; - u32 reserved; + u32 base; + s16 length; + s16 status; + u32 misc; + u32 reserved; }; /* The PCNET32 32-Bit initialization block, described in databook. */ struct pcnet32_init_block { - u16 mode; - u16 tlen_rlen; - u8 phys_addr[6]; - u16 reserved; - u32 filter[2]; - /* Receive and transmit ring base, along with extra bits. */ - u32 rx_ring; - u32 tx_ring; + u16 mode; + u16 tlen_rlen; + u8 phys_addr[6]; + u16 reserved; + u32 filter[2]; + /* Receive and transmit ring base, along with extra bits. */ + u32 rx_ring; + u32 tx_ring; }; /* PCnet32 access functions */ struct pcnet32_access { - u16 (*read_csr)(unsigned long, int); - void (*write_csr)(unsigned long, int, u16); - u16 (*read_bcr)(unsigned long, int); - void (*write_bcr)(unsigned long, int, u16); - u16 (*read_rap)(unsigned long); - void (*write_rap)(unsigned long, u16); - void (*reset)(unsigned long); + u16 (*read_csr) (unsigned long, int); + void (*write_csr) (unsigned long, int, u16); + u16 (*read_bcr) (unsigned long, int); + void (*write_bcr) (unsigned long, int, u16); + u16 (*read_rap) (unsigned long); + void (*write_rap) (unsigned long, u16); + void (*reset) (unsigned long); }; /* @@ -346,760 +233,794 @@ struct pcnet32_access { * so the structure should be allocated using pci_alloc_consistent(). 
*/ struct pcnet32_private { - struct pcnet32_init_block init_block; - /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ - struct pcnet32_rx_head *rx_ring; - struct pcnet32_tx_head *tx_ring; - dma_addr_t dma_addr; /* DMA address of beginning of this - object, returned by - pci_alloc_consistent */ - struct pci_dev *pci_dev; /* Pointer to the associated pci device - structure */ - const char *name; - /* The saved address of a sent-in-place packet/buffer, for skfree(). */ - struct sk_buff **tx_skbuff; - struct sk_buff **rx_skbuff; - dma_addr_t *tx_dma_addr; - dma_addr_t *rx_dma_addr; - struct pcnet32_access a; - spinlock_t lock; /* Guard lock */ - unsigned int cur_rx, cur_tx; /* The next free ring entry */ - unsigned int rx_ring_size; /* current rx ring size */ - unsigned int tx_ring_size; /* current tx ring size */ - unsigned int rx_mod_mask; /* rx ring modular mask */ - unsigned int tx_mod_mask; /* tx ring modular mask */ - unsigned short rx_len_bits; - unsigned short tx_len_bits; - dma_addr_t rx_ring_dma_addr; - dma_addr_t tx_ring_dma_addr; - unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ - struct net_device_stats stats; - char tx_full; - int options; - unsigned int shared_irq:1, /* shared irq possible */ - dxsuflo:1, /* disable transmit stop on uflo */ - mii:1; /* mii port available */ - struct net_device *next; - struct mii_if_info mii_if; - struct timer_list watchdog_timer; - struct timer_list blink_timer; - u32 msg_enable; /* debug message level */ + struct pcnet32_init_block init_block; + /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ + struct pcnet32_rx_head *rx_ring; + struct pcnet32_tx_head *tx_ring; + dma_addr_t dma_addr;/* DMA address of beginning of this + object, returned by pci_alloc_consistent */ + struct pci_dev *pci_dev; + const char *name; + /* The saved address of a sent-in-place packet/buffer, for skfree(). */ + struct sk_buff **tx_skbuff; + struct sk_buff **rx_skbuff; + dma_addr_t *tx_dma_addr; + dma_addr_t *rx_dma_addr; + struct pcnet32_access a; + spinlock_t lock; /* Guard lock */ + unsigned int cur_rx, cur_tx; /* The next free ring entry */ + unsigned int rx_ring_size; /* current rx ring size */ + unsigned int tx_ring_size; /* current tx ring size */ + unsigned int rx_mod_mask; /* rx ring modular mask */ + unsigned int tx_mod_mask; /* tx ring modular mask */ + unsigned short rx_len_bits; + unsigned short tx_len_bits; + dma_addr_t rx_ring_dma_addr; + dma_addr_t tx_ring_dma_addr; + unsigned int dirty_rx, /* ring entries to be freed. 
*/ + dirty_tx; + + struct net_device_stats stats; + char tx_full; + char phycount; /* number of phys found */ + int options; + unsigned int shared_irq:1, /* shared irq possible */ + dxsuflo:1, /* disable transmit stop on uflo */ + mii:1; /* mii port available */ + struct net_device *next; + struct mii_if_info mii_if; + struct timer_list watchdog_timer; + struct timer_list blink_timer; + u32 msg_enable; /* debug message level */ + + /* each bit indicates an available PHY */ + u32 phymask; }; static void pcnet32_probe_vlbus(void); -static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); -static int pcnet32_probe1(unsigned long, int, struct pci_dev *); -static int pcnet32_open(struct net_device *); -static int pcnet32_init_ring(struct net_device *); -static int pcnet32_start_xmit(struct sk_buff *, struct net_device *); -static int pcnet32_rx(struct net_device *); -static void pcnet32_tx_timeout (struct net_device *dev); +static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); +static int pcnet32_probe1(unsigned long, int, struct pci_dev *); +static int pcnet32_open(struct net_device *); +static int pcnet32_init_ring(struct net_device *); +static int pcnet32_start_xmit(struct sk_buff *, struct net_device *); +static int pcnet32_rx(struct net_device *); +static void pcnet32_tx_timeout(struct net_device *dev); static irqreturn_t pcnet32_interrupt(int, void *, struct pt_regs *); -static int pcnet32_close(struct net_device *); +static int pcnet32_close(struct net_device *); static struct net_device_stats *pcnet32_get_stats(struct net_device *); static void pcnet32_load_multicast(struct net_device *dev); static void pcnet32_set_multicast_list(struct net_device *); -static int pcnet32_ioctl(struct net_device *, struct ifreq *, int); +static int pcnet32_ioctl(struct net_device *, struct ifreq *, int); static void pcnet32_watchdog(struct net_device *); static int mdio_read(struct net_device *dev, int phy_id, int reg_num); -static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val); +static void mdio_write(struct net_device *dev, int phy_id, int reg_num, + int val); static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits); static void pcnet32_ethtool_test(struct net_device *dev, - struct ethtool_test *eth_test, u64 *data); -static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1); + struct ethtool_test *eth_test, u64 * data); +static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1); static int pcnet32_phys_id(struct net_device *dev, u32 data); static void pcnet32_led_blink_callback(struct net_device *dev); static int pcnet32_get_regs_len(struct net_device *dev); static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, - void *ptr); + void *ptr); static void pcnet32_purge_tx_ring(struct net_device *dev); static int pcnet32_alloc_ring(struct net_device *dev, char *name); static void pcnet32_free_ring(struct net_device *dev); - +static void pcnet32_check_media(struct net_device *dev, int verbose); enum pci_flags_bit { - PCI_USES_IO=1, PCI_USES_MEM=2, PCI_USES_MASTER=4, - PCI_ADDR0=0x10<<0, PCI_ADDR1=0x10<<1, PCI_ADDR2=0x10<<2, PCI_ADDR3=0x10<<3, + PCI_USES_IO = 1, PCI_USES_MEM = 2, PCI_USES_MASTER = 4, + PCI_ADDR0 = 0x10 << 0, PCI_ADDR1 = 0x10 << 1, PCI_ADDR2 = + 0x10 << 2, PCI_ADDR3 = 0x10 << 3, }; - -static u16 pcnet32_wio_read_csr (unsigned long addr, int index) +static u16 pcnet32_wio_read_csr(unsigned long addr, int index) { - outw (index, 
addr+PCNET32_WIO_RAP); - return inw (addr+PCNET32_WIO_RDP); + outw(index, addr + PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_RDP); } -static void pcnet32_wio_write_csr (unsigned long addr, int index, u16 val) +static void pcnet32_wio_write_csr(unsigned long addr, int index, u16 val) { - outw (index, addr+PCNET32_WIO_RAP); - outw (val, addr+PCNET32_WIO_RDP); + outw(index, addr + PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_RDP); } -static u16 pcnet32_wio_read_bcr (unsigned long addr, int index) +static u16 pcnet32_wio_read_bcr(unsigned long addr, int index) { - outw (index, addr+PCNET32_WIO_RAP); - return inw (addr+PCNET32_WIO_BDP); + outw(index, addr + PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_BDP); } -static void pcnet32_wio_write_bcr (unsigned long addr, int index, u16 val) +static void pcnet32_wio_write_bcr(unsigned long addr, int index, u16 val) { - outw (index, addr+PCNET32_WIO_RAP); - outw (val, addr+PCNET32_WIO_BDP); + outw(index, addr + PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_BDP); } -static u16 pcnet32_wio_read_rap (unsigned long addr) +static u16 pcnet32_wio_read_rap(unsigned long addr) { - return inw (addr+PCNET32_WIO_RAP); + return inw(addr + PCNET32_WIO_RAP); } -static void pcnet32_wio_write_rap (unsigned long addr, u16 val) +static void pcnet32_wio_write_rap(unsigned long addr, u16 val) { - outw (val, addr+PCNET32_WIO_RAP); + outw(val, addr + PCNET32_WIO_RAP); } -static void pcnet32_wio_reset (unsigned long addr) +static void pcnet32_wio_reset(unsigned long addr) { - inw (addr+PCNET32_WIO_RESET); + inw(addr + PCNET32_WIO_RESET); } -static int pcnet32_wio_check (unsigned long addr) +static int pcnet32_wio_check(unsigned long addr) { - outw (88, addr+PCNET32_WIO_RAP); - return (inw (addr+PCNET32_WIO_RAP) == 88); + outw(88, addr + PCNET32_WIO_RAP); + return (inw(addr + PCNET32_WIO_RAP) == 88); } static struct pcnet32_access pcnet32_wio = { - .read_csr = pcnet32_wio_read_csr, - .write_csr = pcnet32_wio_write_csr, - .read_bcr = pcnet32_wio_read_bcr, - .write_bcr = pcnet32_wio_write_bcr, - .read_rap = pcnet32_wio_read_rap, - .write_rap = pcnet32_wio_write_rap, - .reset = pcnet32_wio_reset + .read_csr = pcnet32_wio_read_csr, + .write_csr = pcnet32_wio_write_csr, + .read_bcr = pcnet32_wio_read_bcr, + .write_bcr = pcnet32_wio_write_bcr, + .read_rap = pcnet32_wio_read_rap, + .write_rap = pcnet32_wio_write_rap, + .reset = pcnet32_wio_reset }; -static u16 pcnet32_dwio_read_csr (unsigned long addr, int index) +static u16 pcnet32_dwio_read_csr(unsigned long addr, int index) { - outl (index, addr+PCNET32_DWIO_RAP); - return (inl (addr+PCNET32_DWIO_RDP) & 0xffff); + outl(index, addr + PCNET32_DWIO_RAP); + return (inl(addr + PCNET32_DWIO_RDP) & 0xffff); } -static void pcnet32_dwio_write_csr (unsigned long addr, int index, u16 val) +static void pcnet32_dwio_write_csr(unsigned long addr, int index, u16 val) { - outl (index, addr+PCNET32_DWIO_RAP); - outl (val, addr+PCNET32_DWIO_RDP); + outl(index, addr + PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_RDP); } -static u16 pcnet32_dwio_read_bcr (unsigned long addr, int index) +static u16 pcnet32_dwio_read_bcr(unsigned long addr, int index) { - outl (index, addr+PCNET32_DWIO_RAP); - return (inl (addr+PCNET32_DWIO_BDP) & 0xffff); + outl(index, addr + PCNET32_DWIO_RAP); + return (inl(addr + PCNET32_DWIO_BDP) & 0xffff); } -static void pcnet32_dwio_write_bcr (unsigned long addr, int index, u16 val) +static void pcnet32_dwio_write_bcr(unsigned long addr, int index, u16 val) { - outl (index, addr+PCNET32_DWIO_RAP); - outl 
(val, addr+PCNET32_DWIO_BDP); + outl(index, addr + PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_BDP); } -static u16 pcnet32_dwio_read_rap (unsigned long addr) +static u16 pcnet32_dwio_read_rap(unsigned long addr) { - return (inl (addr+PCNET32_DWIO_RAP) & 0xffff); + return (inl(addr + PCNET32_DWIO_RAP) & 0xffff); } -static void pcnet32_dwio_write_rap (unsigned long addr, u16 val) +static void pcnet32_dwio_write_rap(unsigned long addr, u16 val) { - outl (val, addr+PCNET32_DWIO_RAP); + outl(val, addr + PCNET32_DWIO_RAP); } -static void pcnet32_dwio_reset (unsigned long addr) +static void pcnet32_dwio_reset(unsigned long addr) { - inl (addr+PCNET32_DWIO_RESET); + inl(addr + PCNET32_DWIO_RESET); } -static int pcnet32_dwio_check (unsigned long addr) +static int pcnet32_dwio_check(unsigned long addr) { - outl (88, addr+PCNET32_DWIO_RAP); - return ((inl (addr+PCNET32_DWIO_RAP) & 0xffff) == 88); + outl(88, addr + PCNET32_DWIO_RAP); + return ((inl(addr + PCNET32_DWIO_RAP) & 0xffff) == 88); } static struct pcnet32_access pcnet32_dwio = { - .read_csr = pcnet32_dwio_read_csr, - .write_csr = pcnet32_dwio_write_csr, - .read_bcr = pcnet32_dwio_read_bcr, - .write_bcr = pcnet32_dwio_write_bcr, - .read_rap = pcnet32_dwio_read_rap, - .write_rap = pcnet32_dwio_write_rap, - .reset = pcnet32_dwio_reset + .read_csr = pcnet32_dwio_read_csr, + .write_csr = pcnet32_dwio_write_csr, + .read_bcr = pcnet32_dwio_read_bcr, + .write_bcr = pcnet32_dwio_write_bcr, + .read_rap = pcnet32_dwio_read_rap, + .write_rap = pcnet32_dwio_write_rap, + .reset = pcnet32_dwio_reset }; #ifdef CONFIG_NET_POLL_CONTROLLER static void pcnet32_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); - pcnet32_interrupt(0, dev, NULL); - enable_irq(dev->irq); + disable_irq(dev->irq); + pcnet32_interrupt(0, dev, NULL); + enable_irq(dev->irq); } #endif - static int pcnet32_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; - - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - mii_ethtool_gset(&lp->mii_if, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - r = 0; - } - return r; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; + + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + mii_ethtool_gset(&lp->mii_if, cmd); + spin_unlock_irqrestore(&lp->lock, flags); + r = 0; + } + return r; } static int pcnet32_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - r = mii_ethtool_sset(&lp->mii_if, cmd); - spin_unlock_irqrestore(&lp->lock, flags); - } - return r; + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + r = mii_ethtool_sset(&lp->mii_if, cmd); + spin_unlock_irqrestore(&lp->lock, flags); + } + return r; } -static void pcnet32_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +static void pcnet32_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) { - struct pcnet32_private *lp = dev->priv; - - strcpy (info->driver, DRV_NAME); - strcpy (info->version, DRV_VERSION); - if (lp->pci_dev) - strcpy (info->bus_info, pci_name(lp->pci_dev)); - else - sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr); + struct pcnet32_private *lp = dev->priv; + + strcpy(info->driver, DRV_NAME); + strcpy(info->version, DRV_VERSION); 
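For orientation, a minimal sketch of the ring-size rounding that pcnet32_set_ringparam (further below) applies to the ring parameters requested through ethtool; the function name pcnet32_round_ring and the log_max argument are illustrative stand-ins for the PCNET32_LOG_MAX_TX_BUFFERS / PCNET32_LOG_MAX_RX_BUFFERS limits used by the driver:

/* Round a requested ring size up to a power of two (minimum 4) so that
 * ring indices can be wrapped with a simple mask (size - 1).  The caller
 * is assumed to have already clamped 'requested' to the ring maximum,
 * as pcnet32_set_ringparam does with TX/RX_MAX_RING_SIZE. */
static unsigned int pcnet32_round_ring(unsigned int requested, int log_max)
{
	int i;

	for (i = 2; i <= log_max; i++)
		if (requested <= (1U << i))
			break;
	return 1U << i;		/* e.g. a request of 100 becomes 128 */
}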
+ if (lp->pci_dev) + strcpy(info->bus_info, pci_name(lp->pci_dev)); + else + sprintf(info->bus_info, "VLB 0x%lx", dev->base_addr); } static u32 pcnet32_get_link(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r; - - spin_lock_irqsave(&lp->lock, flags); - if (lp->mii) { - r = mii_link_ok(&lp->mii_if); - } else { - ulong ioaddr = dev->base_addr; /* card base I/O address */ - r = (lp->a.read_bcr(ioaddr, 4) != 0xc0); - } - spin_unlock_irqrestore(&lp->lock, flags); + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r; - return r; + spin_lock_irqsave(&lp->lock, flags); + if (lp->mii) { + r = mii_link_ok(&lp->mii_if); + } else { + ulong ioaddr = dev->base_addr; /* card base I/O address */ + r = (lp->a.read_bcr(ioaddr, 4) != 0xc0); + } + spin_unlock_irqrestore(&lp->lock, flags); + + return r; } static u32 pcnet32_get_msglevel(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - return lp->msg_enable; + struct pcnet32_private *lp = dev->priv; + return lp->msg_enable; } static void pcnet32_set_msglevel(struct net_device *dev, u32 value) { - struct pcnet32_private *lp = dev->priv; - lp->msg_enable = value; + struct pcnet32_private *lp = dev->priv; + lp->msg_enable = value; } static int pcnet32_nway_reset(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int r = -EOPNOTSUPP; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int r = -EOPNOTSUPP; - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - r = mii_nway_restart(&lp->mii_if); - spin_unlock_irqrestore(&lp->lock, flags); - } - return r; + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + r = mii_nway_restart(&lp->mii_if); + spin_unlock_irqrestore(&lp->lock, flags); + } + return r; } -static void pcnet32_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static void pcnet32_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - ering->tx_max_pending = TX_MAX_RING_SIZE - 1; - ering->tx_pending = lp->tx_ring_size - 1; - ering->rx_max_pending = RX_MAX_RING_SIZE - 1; - ering->rx_pending = lp->rx_ring_size - 1; + ering->tx_max_pending = TX_MAX_RING_SIZE - 1; + ering->tx_pending = lp->tx_ring_size - 1; + ering->rx_max_pending = RX_MAX_RING_SIZE - 1; + ering->rx_pending = lp->rx_ring_size - 1; } -static int pcnet32_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) +static int pcnet32_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *ering) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; - int i; - - if (ering->rx_mini_pending || ering->rx_jumbo_pending) - return -EINVAL; - - if (netif_running(dev)) - pcnet32_close(dev); - - spin_lock_irqsave(&lp->lock, flags); - pcnet32_free_ring(dev); - lp->tx_ring_size = min(ering->tx_pending, (unsigned int) TX_MAX_RING_SIZE); - lp->rx_ring_size = min(ering->rx_pending, (unsigned int) RX_MAX_RING_SIZE); - - /* set the minimum ring size to 4, to allow the loopback test to work - * unchanged. 
- */ - for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) { - if (lp->tx_ring_size <= (1 << i)) - break; - } - lp->tx_ring_size = (1 << i); - lp->tx_mod_mask = lp->tx_ring_size - 1; - lp->tx_len_bits = (i << 12); - - for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) { - if (lp->rx_ring_size <= (1 << i)) - break; - } - lp->rx_ring_size = (1 << i); - lp->rx_mod_mask = lp->rx_ring_size - 1; - lp->rx_len_bits = (i << 4); - - if (pcnet32_alloc_ring(dev, dev->name)) { + struct pcnet32_private *lp = dev->priv; + unsigned long flags; + int i; + + if (ering->rx_mini_pending || ering->rx_jumbo_pending) + return -EINVAL; + + if (netif_running(dev)) + pcnet32_close(dev); + + spin_lock_irqsave(&lp->lock, flags); pcnet32_free_ring(dev); - spin_unlock_irqrestore(&lp->lock, flags); - return -ENOMEM; - } + lp->tx_ring_size = + min(ering->tx_pending, (unsigned int)TX_MAX_RING_SIZE); + lp->rx_ring_size = + min(ering->rx_pending, (unsigned int)RX_MAX_RING_SIZE); + + /* set the minimum ring size to 4, to allow the loopback test to work + * unchanged. + */ + for (i = 2; i <= PCNET32_LOG_MAX_TX_BUFFERS; i++) { + if (lp->tx_ring_size <= (1 << i)) + break; + } + lp->tx_ring_size = (1 << i); + lp->tx_mod_mask = lp->tx_ring_size - 1; + lp->tx_len_bits = (i << 12); - spin_unlock_irqrestore(&lp->lock, flags); + for (i = 2; i <= PCNET32_LOG_MAX_RX_BUFFERS; i++) { + if (lp->rx_ring_size <= (1 << i)) + break; + } + lp->rx_ring_size = (1 << i); + lp->rx_mod_mask = lp->rx_ring_size - 1; + lp->rx_len_bits = (i << 4); + + if (pcnet32_alloc_ring(dev, dev->name)) { + pcnet32_free_ring(dev); + spin_unlock_irqrestore(&lp->lock, flags); + return -ENOMEM; + } - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_INFO PFX "%s: Ring Param Settings: RX: %d, TX: %d\n", - dev->name, lp->rx_ring_size, lp->tx_ring_size); + spin_unlock_irqrestore(&lp->lock, flags); - if (netif_running(dev)) - pcnet32_open(dev); + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_INFO PFX + "%s: Ring Param Settings: RX: %d, TX: %d\n", dev->name, + lp->rx_ring_size, lp->tx_ring_size); - return 0; + if (netif_running(dev)) + pcnet32_open(dev); + + return 0; } -static void pcnet32_get_strings(struct net_device *dev, u32 stringset, u8 *data) +static void pcnet32_get_strings(struct net_device *dev, u32 stringset, + u8 * data) { - memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test)); + memcpy(data, pcnet32_gstrings_test, sizeof(pcnet32_gstrings_test)); } static int pcnet32_self_test_count(struct net_device *dev) { - return PCNET32_TEST_LEN; + return PCNET32_TEST_LEN; } static void pcnet32_ethtool_test(struct net_device *dev, - struct ethtool_test *test, u64 *data) + struct ethtool_test *test, u64 * data) { - struct pcnet32_private *lp = dev->priv; - int rc; - - if (test->flags == ETH_TEST_FL_OFFLINE) { - rc = pcnet32_loopback_test(dev, data); - if (rc) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Loopback test failed.\n", dev->name); - test->flags |= ETH_TEST_FL_FAILED; + struct pcnet32_private *lp = dev->priv; + int rc; + + if (test->flags == ETH_TEST_FL_OFFLINE) { + rc = pcnet32_loopback_test(dev, data); + if (rc) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG "%s: Loopback test failed.\n", + dev->name); + test->flags |= ETH_TEST_FL_FAILED; + } else if (netif_msg_hw(lp)) + printk(KERN_DEBUG "%s: Loopback test passed.\n", + dev->name); } else if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Loopback test passed.\n", dev->name); - } else if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: No tests to run (specify 'Offline' on ethtool).", dev->name); -} /* 
end pcnet32_ethtool_test */ + printk(KERN_DEBUG + "%s: No tests to run (specify 'Offline' on ethtool).", + dev->name); +} /* end pcnet32_ethtool_test */ -static int pcnet32_loopback_test(struct net_device *dev, uint64_t *data1) +static int pcnet32_loopback_test(struct net_device *dev, uint64_t * data1) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; /* access to registers */ - ulong ioaddr = dev->base_addr; /* card base I/O address */ - struct sk_buff *skb; /* sk buff */ - int x, i; /* counters */ - int numbuffs = 4; /* number of TX/RX buffers and descs */ - u16 status = 0x8300; /* TX ring status */ - u16 teststatus; /* test of ring status */ - int rc; /* return code */ - int size; /* size of packets */ - unsigned char *packet; /* source packet data */ - static const int data_len = 60; /* length of source packets */ - unsigned long flags; - unsigned long ticks; - - *data1 = 1; /* status of test, default to fail */ - rc = 1; /* default to fail */ - - if (netif_running(dev)) - pcnet32_close(dev); - - spin_lock_irqsave(&lp->lock, flags); - - /* Reset the PCNET32 */ - lp->a.reset (ioaddr); - - /* switch pcnet32 to 32bit mode */ - lp->a.write_bcr (ioaddr, 20, 2); - - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - lp->init_block.filter[0] = 0; - lp->init_block.filter[1] = 0; - - /* purge & init rings but don't actually restart */ - pcnet32_restart(dev, 0x0000); - - lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ - - /* Initialize Transmit buffers. */ - size = data_len + 15; - for (x=0; x<numbuffs; x++) { - if (!(skb = dev_alloc_skb(size))) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Cannot allocate skb at line: %d!\n", - dev->name, __LINE__); - goto clean_up; - } else { - packet = skb->data; - skb_put(skb, size); /* create space for data */ - lp->tx_skbuff[x] = skb; - lp->tx_ring[x].length = le16_to_cpu(-skb->len); - lp->tx_ring[x].misc = 0; - - /* put DA and SA into the skb */ - for (i=0; i<6; i++) - *packet++ = dev->dev_addr[i]; - for (i=0; i<6; i++) - *packet++ = dev->dev_addr[i]; - /* type */ - *packet++ = 0x08; - *packet++ = 0x06; - /* packet number */ - *packet++ = x; - /* fill packet with data */ - for (i=0; i<data_len; i++) - *packet++ = i; - - lp->tx_dma_addr[x] = pci_map_single(lp->pci_dev, skb->data, - skb->len, PCI_DMA_TODEVICE); - lp->tx_ring[x].base = (u32)le32_to_cpu(lp->tx_dma_addr[x]); - wmb(); /* Make sure owner changes after all others are visible */ - lp->tx_ring[x].status = le16_to_cpu(status); - } - } - - x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */ - x = x | 0x0002; - a->write_bcr(ioaddr, 32, x); - - lp->a.write_csr (ioaddr, 15, 0x0044); /* set int loopback in CSR15 */ - - teststatus = le16_to_cpu(0x8000); - lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */ - - /* Check status of descriptors */ - for (x=0; x<numbuffs; x++) { - ticks = 0; - rmb(); - while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) { - spin_unlock_irqrestore(&lp->lock, flags); - mdelay(1); - spin_lock_irqsave(&lp->lock, flags); - rmb(); - ticks++; - } - if (ticks == 200) { - if (netif_msg_hw(lp)) - printk("%s: Desc %d failed to reset!\n",dev->name,x); - break; - } - } - - lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ - wmb(); - if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) { - printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name); - - for (x=0; x<numbuffs; x++) { - printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x); - skb = lp->rx_skbuff[x]; - for (i=0; i<size; i++) { - 
printk("%02x ", *(skb->data+i)); - } - printk("\n"); - } - } - - x = 0; - rc = 0; - while (x<numbuffs && !rc) { - skb = lp->rx_skbuff[x]; - packet = lp->tx_skbuff[x]->data; - for (i=0; i<size; i++) { - if (*(skb->data+i) != packet[i]) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Error in compare! %2x - %02x %02x\n", - dev->name, i, *(skb->data+i), packet[i]); - rc = 1; - break; - } + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; /* access to registers */ + ulong ioaddr = dev->base_addr; /* card base I/O address */ + struct sk_buff *skb; /* sk buff */ + int x, i; /* counters */ + int numbuffs = 4; /* number of TX/RX buffers and descs */ + u16 status = 0x8300; /* TX ring status */ + u16 teststatus; /* test of ring status */ + int rc; /* return code */ + int size; /* size of packets */ + unsigned char *packet; /* source packet data */ + static const int data_len = 60; /* length of source packets */ + unsigned long flags; + unsigned long ticks; + + *data1 = 1; /* status of test, default to fail */ + rc = 1; /* default to fail */ + + if (netif_running(dev)) + pcnet32_close(dev); + + spin_lock_irqsave(&lp->lock, flags); + + /* Reset the PCNET32 */ + lp->a.reset(ioaddr); + + /* switch pcnet32 to 32bit mode */ + lp->a.write_bcr(ioaddr, 20, 2); + + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + lp->init_block.filter[0] = 0; + lp->init_block.filter[1] = 0; + + /* purge & init rings but don't actually restart */ + pcnet32_restart(dev, 0x0000); + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ + + /* Initialize Transmit buffers. */ + size = data_len + 15; + for (x = 0; x < numbuffs; x++) { + if (!(skb = dev_alloc_skb(size))) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Cannot allocate skb at line: %d!\n", + dev->name, __LINE__); + goto clean_up; + } else { + packet = skb->data; + skb_put(skb, size); /* create space for data */ + lp->tx_skbuff[x] = skb; + lp->tx_ring[x].length = le16_to_cpu(-skb->len); + lp->tx_ring[x].misc = 0; + + /* put DA and SA into the skb */ + for (i = 0; i < 6; i++) + *packet++ = dev->dev_addr[i]; + for (i = 0; i < 6; i++) + *packet++ = dev->dev_addr[i]; + /* type */ + *packet++ = 0x08; + *packet++ = 0x06; + /* packet number */ + *packet++ = x; + /* fill packet with data */ + for (i = 0; i < data_len; i++) + *packet++ = i; + + lp->tx_dma_addr[x] = + pci_map_single(lp->pci_dev, skb->data, skb->len, + PCI_DMA_TODEVICE); + lp->tx_ring[x].base = + (u32) le32_to_cpu(lp->tx_dma_addr[x]); + wmb(); /* Make sure owner changes after all others are visible */ + lp->tx_ring[x].status = le16_to_cpu(status); + } + } + + x = a->read_bcr(ioaddr, 32); /* set internal loopback in BSR32 */ + x = x | 0x0002; + a->write_bcr(ioaddr, 32, x); + + lp->a.write_csr(ioaddr, 15, 0x0044); /* set int loopback in CSR15 */ + + teststatus = le16_to_cpu(0x8000); + lp->a.write_csr(ioaddr, 0, 0x0002); /* Set STRT bit */ + + /* Check status of descriptors */ + for (x = 0; x < numbuffs; x++) { + ticks = 0; + rmb(); + while ((lp->rx_ring[x].status & teststatus) && (ticks < 200)) { + spin_unlock_irqrestore(&lp->lock, flags); + mdelay(1); + spin_lock_irqsave(&lp->lock, flags); + rmb(); + ticks++; + } + if (ticks == 200) { + if (netif_msg_hw(lp)) + printk("%s: Desc %d failed to reset!\n", + dev->name, x); + break; + } + } + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Set STOP bit */ + wmb(); + if (netif_msg_hw(lp) && netif_msg_pktdata(lp)) { + printk(KERN_DEBUG "%s: RX loopback packets:\n", dev->name); + + for (x = 0; x < numbuffs; 
x++) { + printk(KERN_DEBUG "%s: Packet %d:\n", dev->name, x); + skb = lp->rx_skbuff[x]; + for (i = 0; i < size; i++) { + printk("%02x ", *(skb->data + i)); + } + printk("\n"); + } + } + + x = 0; + rc = 0; + while (x < numbuffs && !rc) { + skb = lp->rx_skbuff[x]; + packet = lp->tx_skbuff[x]->data; + for (i = 0; i < size; i++) { + if (*(skb->data + i) != packet[i]) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Error in compare! %2x - %02x %02x\n", + dev->name, i, *(skb->data + i), + packet[i]); + rc = 1; + break; + } + } + x++; + } + if (!rc) { + *data1 = 0; } - x++; - } - if (!rc) { - *data1 = 0; - } -clean_up: - pcnet32_purge_tx_ring(dev); - x = a->read_csr(ioaddr, 15) & 0xFFFF; - a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */ + clean_up: + pcnet32_purge_tx_ring(dev); + x = a->read_csr(ioaddr, 15) & 0xFFFF; + a->write_csr(ioaddr, 15, (x & ~0x0044)); /* reset bits 6 and 2 */ - x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ - x = x & ~0x0002; - a->write_bcr(ioaddr, 32, x); + x = a->read_bcr(ioaddr, 32); /* reset internal loopback */ + x = x & ~0x0002; + a->write_bcr(ioaddr, 32, x); - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - if (netif_running(dev)) { - pcnet32_open(dev); - } else { - lp->a.write_bcr (ioaddr, 20, 4); /* return to 16bit mode */ - } + if (netif_running(dev)) { + pcnet32_open(dev); + } else { + lp->a.write_bcr(ioaddr, 20, 4); /* return to 16bit mode */ + } - return(rc); -} /* end pcnet32_loopback_test */ + return (rc); +} /* end pcnet32_loopback_test */ static void pcnet32_led_blink_callback(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - unsigned long flags; - int i; - - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000); - } - spin_unlock_irqrestore(&lp->lock, flags); - - mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT); + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr = dev->base_addr; + unsigned long flags; + int i; + + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + a->write_bcr(ioaddr, i, a->read_bcr(ioaddr, i) ^ 0x4000); + } + spin_unlock_irqrestore(&lp->lock, flags); + + mod_timer(&lp->blink_timer, PCNET32_BLINK_TIMEOUT); } static int pcnet32_phys_id(struct net_device *dev, u32 data) { - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - unsigned long flags; - int i, regs[4]; - - if (!lp->blink_timer.function) { - init_timer(&lp->blink_timer); - lp->blink_timer.function = (void *) pcnet32_led_blink_callback; - lp->blink_timer.data = (unsigned long) dev; - } - - /* Save the current value of the bcrs */ - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - regs[i-4] = a->read_bcr(ioaddr, i); - } - spin_unlock_irqrestore(&lp->lock, flags); - - mod_timer(&lp->blink_timer, jiffies); - set_current_state(TASK_INTERRUPTIBLE); - - if ((!data) || (data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))) - data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ); - - msleep_interruptible(data * 1000); - del_timer_sync(&lp->blink_timer); - - /* Restore the original value of the bcrs */ - spin_lock_irqsave(&lp->lock, flags); - for (i=4; i<8; i++) { - a->write_bcr(ioaddr, i, regs[i-4]); - } - spin_unlock_irqrestore(&lp->lock, flags); - - return 0; + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr 
= dev->base_addr; + unsigned long flags; + int i, regs[4]; + + if (!lp->blink_timer.function) { + init_timer(&lp->blink_timer); + lp->blink_timer.function = (void *)pcnet32_led_blink_callback; + lp->blink_timer.data = (unsigned long)dev; + } + + /* Save the current value of the bcrs */ + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + regs[i - 4] = a->read_bcr(ioaddr, i); + } + spin_unlock_irqrestore(&lp->lock, flags); + + mod_timer(&lp->blink_timer, jiffies); + set_current_state(TASK_INTERRUPTIBLE); + + if ((!data) || (data > (u32) (MAX_SCHEDULE_TIMEOUT / HZ))) + data = (u32) (MAX_SCHEDULE_TIMEOUT / HZ); + + msleep_interruptible(data * 1000); + del_timer_sync(&lp->blink_timer); + + /* Restore the original value of the bcrs */ + spin_lock_irqsave(&lp->lock, flags); + for (i = 4; i < 8; i++) { + a->write_bcr(ioaddr, i, regs[i - 4]); + } + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; } +#define PCNET32_REGS_PER_PHY 32 +#define PCNET32_MAX_PHYS 32 static int pcnet32_get_regs_len(struct net_device *dev) { - return(PCNET32_NUM_REGS * sizeof(u16)); + struct pcnet32_private *lp = dev->priv; + int j = lp->phycount * PCNET32_REGS_PER_PHY; + + return ((PCNET32_NUM_REGS + j) * sizeof(u16)); } static void pcnet32_get_regs(struct net_device *dev, struct ethtool_regs *regs, - void *ptr) + void *ptr) { - int i, csr0; - u16 *buff = ptr; - struct pcnet32_private *lp = dev->priv; - struct pcnet32_access *a = &lp->a; - ulong ioaddr = dev->base_addr; - int ticks; - unsigned long flags; - - spin_lock_irqsave(&lp->lock, flags); - - csr0 = a->read_csr(ioaddr, 0); - if (!(csr0 & 0x0004)) { /* If not stopped */ - /* set SUSPEND (SPND) - CSR5 bit 0 */ - a->write_csr(ioaddr, 5, 0x0001); - - /* poll waiting for bit to be set */ - ticks = 0; - while (!(a->read_csr(ioaddr, 5) & 0x0001)) { - spin_unlock_irqrestore(&lp->lock, flags); - mdelay(1); - spin_lock_irqsave(&lp->lock, flags); - ticks++; - if (ticks > 200) { - if (netif_msg_hw(lp)) - printk(KERN_DEBUG "%s: Error getting into suspend!\n", - dev->name); - break; - } - } - } + int i, csr0; + u16 *buff = ptr; + struct pcnet32_private *lp = dev->priv; + struct pcnet32_access *a = &lp->a; + ulong ioaddr = dev->base_addr; + int ticks; + unsigned long flags; - /* read address PROM */ - for (i=0; i<16; i += 2) - *buff++ = inw(ioaddr + i); + spin_lock_irqsave(&lp->lock, flags); - /* read control and status registers */ - for (i=0; i<90; i++) { - *buff++ = a->read_csr(ioaddr, i); - } + csr0 = a->read_csr(ioaddr, 0); + if (!(csr0 & 0x0004)) { /* If not stopped */ + /* set SUSPEND (SPND) - CSR5 bit 0 */ + a->write_csr(ioaddr, 5, 0x0001); + + /* poll waiting for bit to be set */ + ticks = 0; + while (!(a->read_csr(ioaddr, 5) & 0x0001)) { + spin_unlock_irqrestore(&lp->lock, flags); + mdelay(1); + spin_lock_irqsave(&lp->lock, flags); + ticks++; + if (ticks > 200) { + if (netif_msg_hw(lp)) + printk(KERN_DEBUG + "%s: Error getting into suspend!\n", + dev->name); + break; + } + } + } - *buff++ = a->read_csr(ioaddr, 112); - *buff++ = a->read_csr(ioaddr, 114); + /* read address PROM */ + for (i = 0; i < 16; i += 2) + *buff++ = inw(ioaddr + i); - /* read bus configuration registers */ - for (i=0; i<30; i++) { - *buff++ = a->read_bcr(ioaddr, i); - } - *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */ - for (i=31; i<36; i++) { - *buff++ = a->read_bcr(ioaddr, i); - } + /* read control and status registers */ + for (i = 0; i < 90; i++) { + *buff++ = a->read_csr(ioaddr, i); + } + + *buff++ = a->read_csr(ioaddr, 112); + *buff++ = a->read_csr(ioaddr, 114); 
- /* read mii phy registers */ - if (lp->mii) { - for (i=0; i<32; i++) { - lp->a.write_bcr(ioaddr, 33, ((lp->mii_if.phy_id) << 5) | i); - *buff++ = lp->a.read_bcr(ioaddr, 34); + /* read bus configuration registers */ + for (i = 0; i < 30; i++) { + *buff++ = a->read_bcr(ioaddr, i); + } + *buff++ = 0; /* skip bcr30 so as not to hang 79C976 */ + for (i = 31; i < 36; i++) { + *buff++ = a->read_bcr(ioaddr, i); } - } - if (!(csr0 & 0x0004)) { /* If not stopped */ - /* clear SUSPEND (SPND) - CSR5 bit 0 */ - a->write_csr(ioaddr, 5, 0x0000); - } + /* read mii phy registers */ + if (lp->mii) { + int j; + for (j = 0; j < PCNET32_MAX_PHYS; j++) { + if (lp->phymask & (1 << j)) { + for (i = 0; i < PCNET32_REGS_PER_PHY; i++) { + lp->a.write_bcr(ioaddr, 33, + (j << 5) | i); + *buff++ = lp->a.read_bcr(ioaddr, 34); + } + } + } + } - i = buff - (u16 *)ptr; - for (; i < PCNET32_NUM_REGS; i++) - *buff++ = 0; + if (!(csr0 & 0x0004)) { /* If not stopped */ + /* clear SUSPEND (SPND) - CSR5 bit 0 */ + a->write_csr(ioaddr, 5, 0x0000); + } - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); } static struct ethtool_ops pcnet32_ethtool_ops = { - .get_settings = pcnet32_get_settings, - .set_settings = pcnet32_set_settings, - .get_drvinfo = pcnet32_get_drvinfo, - .get_msglevel = pcnet32_get_msglevel, - .set_msglevel = pcnet32_set_msglevel, - .nway_reset = pcnet32_nway_reset, - .get_link = pcnet32_get_link, - .get_ringparam = pcnet32_get_ringparam, - .set_ringparam = pcnet32_set_ringparam, - .get_tx_csum = ethtool_op_get_tx_csum, - .get_sg = ethtool_op_get_sg, - .get_tso = ethtool_op_get_tso, - .get_strings = pcnet32_get_strings, - .self_test_count = pcnet32_self_test_count, - .self_test = pcnet32_ethtool_test, - .phys_id = pcnet32_phys_id, - .get_regs_len = pcnet32_get_regs_len, - .get_regs = pcnet32_get_regs, - .get_perm_addr = ethtool_op_get_perm_addr, + .get_settings = pcnet32_get_settings, + .set_settings = pcnet32_set_settings, + .get_drvinfo = pcnet32_get_drvinfo, + .get_msglevel = pcnet32_get_msglevel, + .set_msglevel = pcnet32_set_msglevel, + .nway_reset = pcnet32_nway_reset, + .get_link = pcnet32_get_link, + .get_ringparam = pcnet32_get_ringparam, + .set_ringparam = pcnet32_set_ringparam, + .get_tx_csum = ethtool_op_get_tx_csum, + .get_sg = ethtool_op_get_sg, + .get_tso = ethtool_op_get_tso, + .get_strings = pcnet32_get_strings, + .self_test_count = pcnet32_self_test_count, + .self_test = pcnet32_ethtool_test, + .phys_id = pcnet32_phys_id, + .get_regs_len = pcnet32_get_regs_len, + .get_regs = pcnet32_get_regs, + .get_perm_addr = ethtool_op_get_perm_addr, }; /* only probes for non-PCI devices, the rest are handled by * pci_register_driver via pcnet32_probe_pci */ -static void __devinit -pcnet32_probe_vlbus(void) +static void __devinit pcnet32_probe_vlbus(void) { - unsigned int *port, ioaddr; - - /* search for PCnet32 VLB cards at known addresses */ - for (port = pcnet32_portlist; (ioaddr = *port); port++) { - if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) { - /* check if there is really a pcnet chip on that ioaddr */ - if ((inb(ioaddr + 14) == 0x57) && (inb(ioaddr + 15) == 0x57)) { - pcnet32_probe1(ioaddr, 0, NULL); - } else { - release_region(ioaddr, PCNET32_TOTAL_SIZE); - } - } - } + unsigned int *port, ioaddr; + + /* search for PCnet32 VLB cards at known addresses */ + for (port = pcnet32_portlist; (ioaddr = *port); port++) { + if (request_region + (ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_vlbus")) { + /* check if there is really a pcnet chip on 
that ioaddr */ + if ((inb(ioaddr + 14) == 0x57) + && (inb(ioaddr + 15) == 0x57)) { + pcnet32_probe1(ioaddr, 0, NULL); + } else { + release_region(ioaddr, PCNET32_TOTAL_SIZE); + } + } + } } - static int __devinit pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) { - unsigned long ioaddr; - int err; - - err = pci_enable_device(pdev); - if (err < 0) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "failed to enable device -- err=%d\n", err); - return err; - } - pci_set_master(pdev); + unsigned long ioaddr; + int err; + + err = pci_enable_device(pdev); + if (err < 0) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "failed to enable device -- err=%d\n", err); + return err; + } + pci_set_master(pdev); + + ioaddr = pci_resource_start(pdev, 0); + if (!ioaddr) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "card has no PCI IO resources, aborting\n"); + return -ENODEV; + } - ioaddr = pci_resource_start (pdev, 0); - if (!ioaddr) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk (KERN_ERR PFX "card has no PCI IO resources, aborting\n"); - return -ENODEV; - } + if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "architecture does not support 32bit PCI busmaster DMA\n"); + return -ENODEV; + } + if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") == + NULL) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "io address range already allocated\n"); + return -EBUSY; + } - if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "architecture does not support 32bit PCI busmaster DMA\n"); - return -ENODEV; - } - if (request_region(ioaddr, PCNET32_TOTAL_SIZE, "pcnet32_probe_pci") == NULL) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "io address range already allocated\n"); - return -EBUSY; - } - - err = pcnet32_probe1(ioaddr, 1, pdev); - if (err < 0) { - pci_disable_device(pdev); - } - return err; + err = pcnet32_probe1(ioaddr, 1, pdev); + if (err < 0) { + pci_disable_device(pdev); + } + return err; } - /* pcnet32_probe1 * Called from both pcnet32_probe_vlbus and pcnet_probe_pci. * pdev will be NULL when called from pcnet32_probe_vlbus. 
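The probe code in the next hunk begins by choosing between the two register-access methods; a compact sketch of that detection, using the wio/dwio helpers defined earlier in this file: after a reset, CSR0 must read back as 4 (STOP set) and the RAP register must echo a written test value (the check helpers use 88) before the 16-bit word-I/O interface is trusted, otherwise the 32-bit dword-I/O interface is tried. The wrapper function itself is illustrative, not part of the driver:

/* Returns 1 for word I/O (pcnet32_wio), 0 for dword I/O (pcnet32_dwio),
 * -1 if no PCnet chip answers at this I/O address. */
static int pcnet32_detect_access(unsigned long ioaddr)
{
	pcnet32_wio_reset(ioaddr);
	if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr))
		return 1;
	pcnet32_dwio_reset(ioaddr);
	if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr))
		return 0;
	return -1;
}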
@@ -1107,630 +1028,764 @@ pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) static int __devinit pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) { - struct pcnet32_private *lp; - dma_addr_t lp_dma_addr; - int i, media; - int fdx, mii, fset, dxsuflo; - int chip_version; - char *chipname; - struct net_device *dev; - struct pcnet32_access *a = NULL; - u8 promaddr[6]; - int ret = -ENODEV; - - /* reset the chip */ - pcnet32_wio_reset(ioaddr); - - /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ - if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { - a = &pcnet32_wio; - } else { - pcnet32_dwio_reset(ioaddr); - if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { - a = &pcnet32_dwio; - } else - goto err_release_region; - } - - chip_version = a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr,89) << 16); - if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW)) - printk(KERN_INFO " PCnet chip version is %#x.\n", chip_version); - if ((chip_version & 0xfff) != 0x003) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "Unsupported chip version.\n"); - goto err_release_region; - } - - /* initialize variables */ - fdx = mii = fset = dxsuflo = 0; - chip_version = (chip_version >> 12) & 0xffff; - - switch (chip_version) { - case 0x2420: - chipname = "PCnet/PCI 79C970"; /* PCI */ - break; - case 0x2430: - if (shared) - chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */ - else - chipname = "PCnet/32 79C965"; /* 486/VL bus */ - break; - case 0x2621: - chipname = "PCnet/PCI II 79C970A"; /* PCI */ - fdx = 1; - break; - case 0x2623: - chipname = "PCnet/FAST 79C971"; /* PCI */ - fdx = 1; mii = 1; fset = 1; - break; - case 0x2624: - chipname = "PCnet/FAST+ 79C972"; /* PCI */ - fdx = 1; mii = 1; fset = 1; - break; - case 0x2625: - chipname = "PCnet/FAST III 79C973"; /* PCI */ - fdx = 1; mii = 1; - break; - case 0x2626: - chipname = "PCnet/Home 79C978"; /* PCI */ - fdx = 1; + struct pcnet32_private *lp; + dma_addr_t lp_dma_addr; + int i, media; + int fdx, mii, fset, dxsuflo; + int chip_version; + char *chipname; + struct net_device *dev; + struct pcnet32_access *a = NULL; + u8 promaddr[6]; + int ret = -ENODEV; + + /* reset the chip */ + pcnet32_wio_reset(ioaddr); + + /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ + if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { + a = &pcnet32_wio; + } else { + pcnet32_dwio_reset(ioaddr); + if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 + && pcnet32_dwio_check(ioaddr)) { + a = &pcnet32_dwio; + } else + goto err_release_region; + } + + chip_version = + a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr, 89) << 16); + if ((pcnet32_debug & NETIF_MSG_PROBE) && (pcnet32_debug & NETIF_MSG_HW)) + printk(KERN_INFO " PCnet chip version is %#x.\n", + chip_version); + if ((chip_version & 0xfff) != 0x003) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX "Unsupported chip version.\n"); + goto err_release_region; + } + + /* initialize variables */ + fdx = mii = fset = dxsuflo = 0; + chip_version = (chip_version >> 12) & 0xffff; + + switch (chip_version) { + case 0x2420: + chipname = "PCnet/PCI 79C970"; /* PCI */ + break; + case 0x2430: + if (shared) + chipname = "PCnet/PCI 79C970"; /* 970 gives the wrong chip id back */ + else + chipname = "PCnet/32 79C965"; /* 486/VL bus */ + break; + case 0x2621: + chipname = "PCnet/PCI II 79C970A"; /* PCI */ + fdx = 1; + break; + case 0x2623: + 
chipname = "PCnet/FAST 79C971"; /* PCI */ + fdx = 1; + mii = 1; + fset = 1; + break; + case 0x2624: + chipname = "PCnet/FAST+ 79C972"; /* PCI */ + fdx = 1; + mii = 1; + fset = 1; + break; + case 0x2625: + chipname = "PCnet/FAST III 79C973"; /* PCI */ + fdx = 1; + mii = 1; + break; + case 0x2626: + chipname = "PCnet/Home 79C978"; /* PCI */ + fdx = 1; + /* + * This is based on specs published at www.amd.com. This section + * assumes that a card with a 79C978 wants to go into standard + * ethernet mode. The 79C978 can also go into 1Mb HomePNA mode, + * and the module option homepna=1 can select this instead. + */ + media = a->read_bcr(ioaddr, 49); + media &= ~3; /* default to 10Mb ethernet */ + if (cards_found < MAX_UNITS && homepna[cards_found]) + media |= 1; /* switch to home wiring mode */ + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_DEBUG PFX "media set to %sMbit mode.\n", + (media & 1) ? "1" : "10"); + a->write_bcr(ioaddr, 49, media); + break; + case 0x2627: + chipname = "PCnet/FAST III 79C975"; /* PCI */ + fdx = 1; + mii = 1; + break; + case 0x2628: + chipname = "PCnet/PRO 79C976"; + fdx = 1; + mii = 1; + break; + default: + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX + "PCnet version %#x, no PCnet32 chip.\n", + chip_version); + goto err_release_region; + } + /* - * This is based on specs published at www.amd.com. This section - * assumes that a card with a 79C978 wants to go into standard - * ethernet mode. The 79C978 can also go into 1Mb HomePNA mode, - * and the module option homepna=1 can select this instead. + * On selected chips turn on the BCR18:NOUFLO bit. This stops transmit + * starting until the packet is loaded. Strike one for reliability, lose + * one for latency - although on PCI this isnt a big loss. Older chips + * have FIFO's smaller than a packet, so you can't do this. + * Turn on BCR18:BurstRdEn and BCR18:BurstWrEn. */ - media = a->read_bcr(ioaddr, 49); - media &= ~3; /* default to 10Mb ethernet */ - if (cards_found < MAX_UNITS && homepna[cards_found]) - media |= 1; /* switch to home wiring mode */ - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_DEBUG PFX "media set to %sMbit mode.\n", - (media & 1) ? "1" : "10"); - a->write_bcr(ioaddr, 49, media); - break; - case 0x2627: - chipname = "PCnet/FAST III 79C975"; /* PCI */ - fdx = 1; mii = 1; - break; - case 0x2628: - chipname = "PCnet/PRO 79C976"; - fdx = 1; mii = 1; - break; - default: - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "PCnet version %#x, no PCnet32 chip.\n", - chip_version); - goto err_release_region; - } - - /* - * On selected chips turn on the BCR18:NOUFLO bit. This stops transmit - * starting until the packet is loaded. Strike one for reliability, lose - * one for latency - although on PCI this isnt a big loss. Older chips - * have FIFO's smaller than a packet, so you can't do this. - * Turn on BCR18:BurstRdEn and BCR18:BurstWrEn. 
- */ - - if (fset) { - a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0860)); - a->write_csr(ioaddr, 80, (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00); - dxsuflo = 1; - } - - dev = alloc_etherdev(0); - if (!dev) { + + if (fset) { + a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0860)); + a->write_csr(ioaddr, 80, + (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00); + dxsuflo = 1; + } + + dev = alloc_etherdev(0); + if (!dev) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "Memory allocation failed.\n"); + ret = -ENOMEM; + goto err_release_region; + } + SET_NETDEV_DEV(dev, &pdev->dev); + if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "Memory allocation failed.\n"); - ret = -ENOMEM; - goto err_release_region; - } - SET_NETDEV_DEV(dev, &pdev->dev); - - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); - - /* In most chips, after a chip reset, the ethernet address is read from the - * station address PROM at the base address and programmed into the - * "Physical Address Registers" CSR12-14. - * As a precautionary measure, we read the PROM values and complain if - * they disagree with the CSRs. Either way, we use the CSR values, and - * double check that they are valid. - */ - for (i = 0; i < 3; i++) { - unsigned int val; - val = a->read_csr(ioaddr, i+12) & 0x0ffff; - /* There may be endianness issues here. */ - dev->dev_addr[2*i] = val & 0x0ff; - dev->dev_addr[2*i+1] = (val >> 8) & 0x0ff; - } - - /* read PROM address and compare with CSR address */ - for (i = 0; i < 6; i++) - promaddr[i] = inb(ioaddr + i); - - if (memcmp(promaddr, dev->dev_addr, 6) - || !is_valid_ether_addr(dev->dev_addr)) { - if (is_valid_ether_addr(promaddr)) { - if (pcnet32_debug & NETIF_MSG_PROBE) { - printk(" warning: CSR address invalid,\n"); - printk(KERN_INFO " using instead PROM address of"); - } - memcpy(dev->dev_addr, promaddr, 6); - } - } - memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); - - /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */ - if (!is_valid_ether_addr(dev->perm_addr)) - memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); - - if (pcnet32_debug & NETIF_MSG_PROBE) { + printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); + + /* In most chips, after a chip reset, the ethernet address is read from the + * station address PROM at the base address and programmed into the + * "Physical Address Registers" CSR12-14. + * As a precautionary measure, we read the PROM values and complain if + * they disagree with the CSRs. Either way, we use the CSR values, and + * double check that they are valid. + */ + for (i = 0; i < 3; i++) { + unsigned int val; + val = a->read_csr(ioaddr, i + 12) & 0x0ffff; + /* There may be endianness issues here. 
*/ + dev->dev_addr[2 * i] = val & 0x0ff; + dev->dev_addr[2 * i + 1] = (val >> 8) & 0x0ff; + } + + /* read PROM address and compare with CSR address */ for (i = 0; i < 6; i++) - printk(" %2.2x", dev->dev_addr[i]); - - /* Version 0x2623 and 0x2624 */ - if (((chip_version + 1) & 0xfffe) == 0x2624) { - i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ - printk("\n" KERN_INFO " tx_start_pt(0x%04x):",i); - switch(i>>10) { - case 0: printk(" 20 bytes,"); break; - case 1: printk(" 64 bytes,"); break; - case 2: printk(" 128 bytes,"); break; - case 3: printk("~220 bytes,"); break; - } - i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ - printk(" BCR18(%x):",i&0xffff); - if (i & (1<<5)) printk("BurstWrEn "); - if (i & (1<<6)) printk("BurstRdEn "); - if (i & (1<<7)) printk("DWordIO "); - if (i & (1<<11)) printk("NoUFlow "); - i = a->read_bcr(ioaddr, 25); - printk("\n" KERN_INFO " SRAMSIZE=0x%04x,",i<<8); - i = a->read_bcr(ioaddr, 26); - printk(" SRAM_BND=0x%04x,",i<<8); - i = a->read_bcr(ioaddr, 27); - if (i & (1<<14)) printk("LowLatRx"); - } - } - - dev->base_addr = ioaddr; - /* pci_alloc_consistent returns page-aligned memory, so we do not have to check the alignment */ - if ((lp = pci_alloc_consistent(pdev, sizeof(*lp), &lp_dma_addr)) == NULL) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "Consistent memory allocation failed.\n"); - ret = -ENOMEM; - goto err_free_netdev; - } - - memset(lp, 0, sizeof(*lp)); - lp->dma_addr = lp_dma_addr; - lp->pci_dev = pdev; - - spin_lock_init(&lp->lock); - - SET_MODULE_OWNER(dev); - SET_NETDEV_DEV(dev, &pdev->dev); - dev->priv = lp; - lp->name = chipname; - lp->shared_irq = shared; - lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ - lp->rx_ring_size = RX_RING_SIZE; /* default rx ring size */ - lp->tx_mod_mask = lp->tx_ring_size - 1; - lp->rx_mod_mask = lp->rx_ring_size - 1; - lp->tx_len_bits = (PCNET32_LOG_TX_BUFFERS << 12); - lp->rx_len_bits = (PCNET32_LOG_RX_BUFFERS << 4); - lp->mii_if.full_duplex = fdx; - lp->mii_if.phy_id_mask = 0x1f; - lp->mii_if.reg_num_mask = 0x1f; - lp->dxsuflo = dxsuflo; - lp->mii = mii; - lp->msg_enable = pcnet32_debug; - if ((cards_found >= MAX_UNITS) || (options[cards_found] > sizeof(options_mapping))) - lp->options = PCNET32_PORT_ASEL; - else - lp->options = options_mapping[options[cards_found]]; - lp->mii_if.dev = dev; - lp->mii_if.mdio_read = mdio_read; - lp->mii_if.mdio_write = mdio_write; - - if (fdx && !(lp->options & PCNET32_PORT_ASEL) && - ((cards_found>=MAX_UNITS) || full_duplex[cards_found])) - lp->options |= PCNET32_PORT_FD; - - if (!a) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "No access methods\n"); - ret = -ENODEV; - goto err_free_consistent; - } - lp->a = *a; - - /* prior to register_netdev, dev->name is not yet correct */ - if (pcnet32_alloc_ring(dev, pci_name(lp->pci_dev))) { - ret = -ENOMEM; - goto err_free_ring; - } - /* detect special T1/E1 WAN card by checking for MAC address */ - if (dev->dev_addr[0] == 0x00 && dev->dev_addr[1] == 0xe0 + promaddr[i] = inb(ioaddr + i); + + if (memcmp(promaddr, dev->dev_addr, 6) + || !is_valid_ether_addr(dev->dev_addr)) { + if (is_valid_ether_addr(promaddr)) { + if (pcnet32_debug & NETIF_MSG_PROBE) { + printk(" warning: CSR address invalid,\n"); + printk(KERN_INFO + " using instead PROM address of"); + } + memcpy(dev->dev_addr, promaddr, 6); + } + } + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + + /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */ + if 
(!is_valid_ether_addr(dev->perm_addr)) + memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); + + if (pcnet32_debug & NETIF_MSG_PROBE) { + for (i = 0; i < 6; i++) + printk(" %2.2x", dev->dev_addr[i]); + + /* Version 0x2623 and 0x2624 */ + if (((chip_version + 1) & 0xfffe) == 0x2624) { + i = a->read_csr(ioaddr, 80) & 0x0C00; /* Check tx_start_pt */ + printk("\n" KERN_INFO " tx_start_pt(0x%04x):", i); + switch (i >> 10) { + case 0: + printk(" 20 bytes,"); + break; + case 1: + printk(" 64 bytes,"); + break; + case 2: + printk(" 128 bytes,"); + break; + case 3: + printk("~220 bytes,"); + break; + } + i = a->read_bcr(ioaddr, 18); /* Check Burst/Bus control */ + printk(" BCR18(%x):", i & 0xffff); + if (i & (1 << 5)) + printk("BurstWrEn "); + if (i & (1 << 6)) + printk("BurstRdEn "); + if (i & (1 << 7)) + printk("DWordIO "); + if (i & (1 << 11)) + printk("NoUFlow "); + i = a->read_bcr(ioaddr, 25); + printk("\n" KERN_INFO " SRAMSIZE=0x%04x,", i << 8); + i = a->read_bcr(ioaddr, 26); + printk(" SRAM_BND=0x%04x,", i << 8); + i = a->read_bcr(ioaddr, 27); + if (i & (1 << 14)) + printk("LowLatRx"); + } + } + + dev->base_addr = ioaddr; + /* pci_alloc_consistent returns page-aligned memory, so we do not have to check the alignment */ + if ((lp = + pci_alloc_consistent(pdev, sizeof(*lp), &lp_dma_addr)) == NULL) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX + "Consistent memory allocation failed.\n"); + ret = -ENOMEM; + goto err_free_netdev; + } + + memset(lp, 0, sizeof(*lp)); + lp->dma_addr = lp_dma_addr; + lp->pci_dev = pdev; + + spin_lock_init(&lp->lock); + + SET_MODULE_OWNER(dev); + SET_NETDEV_DEV(dev, &pdev->dev); + dev->priv = lp; + lp->name = chipname; + lp->shared_irq = shared; + lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ + lp->rx_ring_size = RX_RING_SIZE; /* default rx ring size */ + lp->tx_mod_mask = lp->tx_ring_size - 1; + lp->rx_mod_mask = lp->rx_ring_size - 1; + lp->tx_len_bits = (PCNET32_LOG_TX_BUFFERS << 12); + lp->rx_len_bits = (PCNET32_LOG_RX_BUFFERS << 4); + lp->mii_if.full_duplex = fdx; + lp->mii_if.phy_id_mask = 0x1f; + lp->mii_if.reg_num_mask = 0x1f; + lp->dxsuflo = dxsuflo; + lp->mii = mii; + lp->msg_enable = pcnet32_debug; + if ((cards_found >= MAX_UNITS) + || (options[cards_found] > sizeof(options_mapping))) + lp->options = PCNET32_PORT_ASEL; + else + lp->options = options_mapping[options[cards_found]]; + lp->mii_if.dev = dev; + lp->mii_if.mdio_read = mdio_read; + lp->mii_if.mdio_write = mdio_write; + + if (fdx && !(lp->options & PCNET32_PORT_ASEL) && + ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) + lp->options |= PCNET32_PORT_FD; + + if (!a) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "No access methods\n"); + ret = -ENODEV; + goto err_free_consistent; + } + lp->a = *a; + + /* prior to register_netdev, dev->name is not yet correct */ + if (pcnet32_alloc_ring(dev, pci_name(lp->pci_dev))) { + ret = -ENOMEM; + goto err_free_ring; + } + /* detect special T1/E1 WAN card by checking for MAC address */ + if (dev->dev_addr[0] == 0x00 && dev->dev_addr[1] == 0xe0 && dev->dev_addr[2] == 0x75) - lp->options = PCNET32_PORT_FD | PCNET32_PORT_GPSI; - - lp->init_block.mode = le16_to_cpu(0x0003); /* Disable Rx and Tx. 
*/ - lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); - for (i = 0; i < 6; i++) - lp->init_block.phys_addr[i] = dev->dev_addr[i]; - lp->init_block.filter[0] = 0x00000000; - lp->init_block.filter[1] = 0x00000000; - lp->init_block.rx_ring = (u32)le32_to_cpu(lp->rx_ring_dma_addr); - lp->init_block.tx_ring = (u32)le32_to_cpu(lp->tx_ring_dma_addr); - - /* switch pcnet32 to 32bit mode */ - a->write_bcr(ioaddr, 20, 2); - - a->write_csr(ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private, - init_block)) & 0xffff); - a->write_csr(ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private, - init_block)) >> 16); - - if (pdev) { /* use the IRQ provided by PCI */ - dev->irq = pdev->irq; - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(" assigned IRQ %d.\n", dev->irq); - } else { - unsigned long irq_mask = probe_irq_on(); + lp->options = PCNET32_PORT_FD | PCNET32_PORT_GPSI; - /* - * To auto-IRQ we enable the initialization-done and DMA error - * interrupts. For ISA boards we get a DMA error, but VLB and PCI - * boards will work. - */ - /* Trigger an initialization just for the interrupt. */ - a->write_csr (ioaddr, 0, 0x41); - mdelay (1); + lp->init_block.mode = le16_to_cpu(0x0003); /* Disable Rx and Tx. */ + lp->init_block.tlen_rlen = + le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); + for (i = 0; i < 6; i++) + lp->init_block.phys_addr[i] = dev->dev_addr[i]; + lp->init_block.filter[0] = 0x00000000; + lp->init_block.filter[1] = 0x00000000; + lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr); + lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr); + + /* switch pcnet32 to 32bit mode */ + a->write_bcr(ioaddr, 20, 2); + + a->write_csr(ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private, + init_block)) & 0xffff); + a->write_csr(ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private, + init_block)) >> 16); + + if (pdev) { /* use the IRQ provided by PCI */ + dev->irq = pdev->irq; + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(" assigned IRQ %d.\n", dev->irq); + } else { + unsigned long irq_mask = probe_irq_on(); + + /* + * To auto-IRQ we enable the initialization-done and DMA error + * interrupts. For ISA boards we get a DMA error, but VLB and PCI + * boards will work. + */ + /* Trigger an initialization just for the interrupt. 
*/ + a->write_csr(ioaddr, 0, 0x41); + mdelay(1); + + dev->irq = probe_irq_off(irq_mask); + if (!dev->irq) { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(", failed to detect IRQ line.\n"); + ret = -ENODEV; + goto err_free_ring; + } + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(", probed IRQ %d.\n", dev->irq); + } - dev->irq = probe_irq_off (irq_mask); - if (!dev->irq) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(", failed to detect IRQ line.\n"); - ret = -ENODEV; - goto err_free_ring; + /* Set the mii phy_id so that we can query the link state */ + if (lp->mii) { + /* lp->phycount and lp->phymask are set to 0 by memset above */ + + lp->mii_if.phy_id = ((lp->a.read_bcr(ioaddr, 33)) >> 5) & 0x1f; + /* scan for PHYs */ + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + unsigned short id1, id2; + + id1 = mdio_read(dev, i, MII_PHYSID1); + if (id1 == 0xffff) + continue; + id2 = mdio_read(dev, i, MII_PHYSID2); + if (id2 == 0xffff) + continue; + if (i == 31 && ((chip_version + 1) & 0xfffe) == 0x2624) + continue; /* 79C971 & 79C972 have phantom phy at id 31 */ + lp->phycount++; + lp->phymask |= (1 << i); + lp->mii_if.phy_id = i; + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO PFX + "Found PHY %04x:%04x at address %d.\n", + id1, id2, i); + } + lp->a.write_bcr(ioaddr, 33, (lp->mii_if.phy_id) << 5); + if (lp->phycount > 1) { + lp->options |= PCNET32_PORT_MII; + } } - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(", probed IRQ %d.\n", dev->irq); - } - - /* Set the mii phy_id so that we can query the link state */ - if (lp->mii) - lp->mii_if.phy_id = ((lp->a.read_bcr (ioaddr, 33)) >> 5) & 0x1f; - - init_timer (&lp->watchdog_timer); - lp->watchdog_timer.data = (unsigned long) dev; - lp->watchdog_timer.function = (void *) &pcnet32_watchdog; - - /* The PCNET32-specific entries in the device structure. */ - dev->open = &pcnet32_open; - dev->hard_start_xmit = &pcnet32_start_xmit; - dev->stop = &pcnet32_close; - dev->get_stats = &pcnet32_get_stats; - dev->set_multicast_list = &pcnet32_set_multicast_list; - dev->do_ioctl = &pcnet32_ioctl; - dev->ethtool_ops = &pcnet32_ethtool_ops; - dev->tx_timeout = pcnet32_tx_timeout; - dev->watchdog_timeo = (5*HZ); + + init_timer(&lp->watchdog_timer); + lp->watchdog_timer.data = (unsigned long)dev; + lp->watchdog_timer.function = (void *)&pcnet32_watchdog; + + /* The PCNET32-specific entries in the device structure. */ + dev->open = &pcnet32_open; + dev->hard_start_xmit = &pcnet32_start_xmit; + dev->stop = &pcnet32_close; + dev->get_stats = &pcnet32_get_stats; + dev->set_multicast_list = &pcnet32_set_multicast_list; + dev->do_ioctl = &pcnet32_ioctl; + dev->ethtool_ops = &pcnet32_ethtool_ops; + dev->tx_timeout = pcnet32_tx_timeout; + dev->watchdog_timeo = (5 * HZ); #ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = pcnet32_poll_controller; + dev->poll_controller = pcnet32_poll_controller; #endif - /* Fill in the generic fields of the device structure. 
*/ - if (register_netdev(dev)) - goto err_free_ring; - - if (pdev) { - pci_set_drvdata(pdev, dev); - } else { - lp->next = pcnet32_dev; - pcnet32_dev = dev; - } - - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_INFO "%s: registered as %s\n", dev->name, lp->name); - cards_found++; - - /* enable LED writes */ - a->write_bcr(ioaddr, 2, a->read_bcr(ioaddr, 2) | 0x1000); - - return 0; - -err_free_ring: - pcnet32_free_ring(dev); -err_free_consistent: - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); -err_free_netdev: - free_netdev(dev); -err_release_region: - release_region(ioaddr, PCNET32_TOTAL_SIZE); - return ret; -} + /* Fill in the generic fields of the device structure. */ + if (register_netdev(dev)) + goto err_free_ring; + + if (pdev) { + pci_set_drvdata(pdev, dev); + } else { + lp->next = pcnet32_dev; + pcnet32_dev = dev; + } + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_INFO "%s: registered as %s\n", dev->name, lp->name); + cards_found++; + + /* enable LED writes */ + a->write_bcr(ioaddr, 2, a->read_bcr(ioaddr, 2) | 0x1000); + + return 0; + + err_free_ring: + pcnet32_free_ring(dev); + err_free_consistent: + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + err_free_netdev: + free_netdev(dev); + err_release_region: + release_region(ioaddr, PCNET32_TOTAL_SIZE); + return ret; +} /* if any allocation fails, caller must also call pcnet32_free_ring */ static int pcnet32_alloc_ring(struct net_device *dev, char *name) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - lp->tx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_tx_head) * lp->tx_ring_size, - &lp->tx_ring_dma_addr); - if (lp->tx_ring == NULL) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Consistent memory allocation failed.\n", - name); - return -ENOMEM; - } - - lp->rx_ring = pci_alloc_consistent(lp->pci_dev, - sizeof(struct pcnet32_rx_head) * lp->rx_ring_size, - &lp->rx_ring_dma_addr); - if (lp->rx_ring == NULL) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Consistent memory allocation failed.\n", - name); - return -ENOMEM; - } - - lp->tx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->tx_ring_size, - GFP_ATOMIC); - if (!lp->tx_dma_addr) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->tx_dma_addr, 0, sizeof(dma_addr_t) * lp->tx_ring_size); - - lp->rx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->rx_ring_size, - GFP_ATOMIC); - if (!lp->rx_dma_addr) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->rx_dma_addr, 0, sizeof(dma_addr_t) * lp->rx_ring_size); - - lp->tx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->tx_ring_size, - GFP_ATOMIC); - if (!lp->tx_skbuff) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->tx_skbuff, 0, sizeof(struct sk_buff *) * lp->tx_ring_size); - - lp->rx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->rx_ring_size, - GFP_ATOMIC); - if (!lp->rx_skbuff) { - if (pcnet32_debug & NETIF_MSG_DRV) - printk("\n" KERN_ERR PFX "%s: Memory allocation failed.\n", name); - return -ENOMEM; - } - memset(lp->rx_skbuff, 0, sizeof(struct sk_buff *) * lp->rx_ring_size); + lp->tx_ring = pci_alloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_tx_head) * + lp->tx_ring_size, + &lp->tx_ring_dma_addr); + if 
(lp->tx_ring == NULL) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Consistent memory allocation failed.\n", + name); + return -ENOMEM; + } - return 0; -} + lp->rx_ring = pci_alloc_consistent(lp->pci_dev, + sizeof(struct pcnet32_rx_head) * + lp->rx_ring_size, + &lp->rx_ring_dma_addr); + if (lp->rx_ring == NULL) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Consistent memory allocation failed.\n", + name); + return -ENOMEM; + } + lp->tx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->tx_ring_size, + GFP_ATOMIC); + if (!lp->tx_dma_addr) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->tx_dma_addr, 0, sizeof(dma_addr_t) * lp->tx_ring_size); + + lp->rx_dma_addr = kmalloc(sizeof(dma_addr_t) * lp->rx_ring_size, + GFP_ATOMIC); + if (!lp->rx_dma_addr) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->rx_dma_addr, 0, sizeof(dma_addr_t) * lp->rx_ring_size); + + lp->tx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->tx_ring_size, + GFP_ATOMIC); + if (!lp->tx_skbuff) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->tx_skbuff, 0, sizeof(struct sk_buff *) * lp->tx_ring_size); + + lp->rx_skbuff = kmalloc(sizeof(struct sk_buff *) * lp->rx_ring_size, + GFP_ATOMIC); + if (!lp->rx_skbuff) { + if (pcnet32_debug & NETIF_MSG_DRV) + printk("\n" KERN_ERR PFX + "%s: Memory allocation failed.\n", name); + return -ENOMEM; + } + memset(lp->rx_skbuff, 0, sizeof(struct sk_buff *) * lp->rx_ring_size); + + return 0; +} static void pcnet32_free_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; + struct pcnet32_private *lp = dev->priv; - kfree(lp->tx_skbuff); - lp->tx_skbuff = NULL; + kfree(lp->tx_skbuff); + lp->tx_skbuff = NULL; - kfree(lp->rx_skbuff); - lp->rx_skbuff = NULL; + kfree(lp->rx_skbuff); + lp->rx_skbuff = NULL; - kfree(lp->tx_dma_addr); - lp->tx_dma_addr = NULL; + kfree(lp->tx_dma_addr); + lp->tx_dma_addr = NULL; - kfree(lp->rx_dma_addr); - lp->rx_dma_addr = NULL; + kfree(lp->rx_dma_addr); + lp->rx_dma_addr = NULL; - if (lp->tx_ring) { - pci_free_consistent(lp->pci_dev, sizeof(struct pcnet32_tx_head) * lp->tx_ring_size, - lp->tx_ring, lp->tx_ring_dma_addr); - lp->tx_ring = NULL; - } + if (lp->tx_ring) { + pci_free_consistent(lp->pci_dev, + sizeof(struct pcnet32_tx_head) * + lp->tx_ring_size, lp->tx_ring, + lp->tx_ring_dma_addr); + lp->tx_ring = NULL; + } - if (lp->rx_ring) { - pci_free_consistent(lp->pci_dev, sizeof(struct pcnet32_rx_head) * lp->rx_ring_size, - lp->rx_ring, lp->rx_ring_dma_addr); - lp->rx_ring = NULL; - } + if (lp->rx_ring) { + pci_free_consistent(lp->pci_dev, + sizeof(struct pcnet32_rx_head) * + lp->rx_ring_size, lp->rx_ring, + lp->rx_ring_dma_addr); + lp->rx_ring = NULL; + } } - -static int -pcnet32_open(struct net_device *dev) +static int pcnet32_open(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 val; - int i; - int rc; - unsigned long flags; - - if (request_irq(dev->irq, &pcnet32_interrupt, - lp->shared_irq ? 
SA_SHIRQ : 0, dev->name, (void *)dev)) { - return -EAGAIN; - } - - spin_lock_irqsave(&lp->lock, flags); - /* Check for a valid station address */ - if (!is_valid_ether_addr(dev->dev_addr)) { - rc = -EINVAL; - goto err_free_irq; - } - - /* Reset the PCNET32 */ - lp->a.reset (ioaddr); - - /* switch pcnet32 to 32bit mode */ - lp->a.write_bcr (ioaddr, 20, 2); - - if (netif_msg_ifup(lp)) - printk(KERN_DEBUG "%s: pcnet32_open() irq %d tx/rx rings %#x/%#x init %#x.\n", - dev->name, dev->irq, - (u32) (lp->tx_ring_dma_addr), - (u32) (lp->rx_ring_dma_addr), - (u32) (lp->dma_addr + offsetof(struct pcnet32_private, init_block))); - - /* set/reset autoselect bit */ - val = lp->a.read_bcr (ioaddr, 2) & ~2; - if (lp->options & PCNET32_PORT_ASEL) - val |= 2; - lp->a.write_bcr (ioaddr, 2, val); - - /* handle full duplex setting */ - if (lp->mii_if.full_duplex) { - val = lp->a.read_bcr (ioaddr, 9) & ~3; - if (lp->options & PCNET32_PORT_FD) { - val |= 1; - if (lp->options == (PCNET32_PORT_FD | PCNET32_PORT_AUI)) + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 val; + int i; + int rc; + unsigned long flags; + + if (request_irq(dev->irq, &pcnet32_interrupt, + lp->shared_irq ? SA_SHIRQ : 0, dev->name, + (void *)dev)) { + return -EAGAIN; + } + + spin_lock_irqsave(&lp->lock, flags); + /* Check for a valid station address */ + if (!is_valid_ether_addr(dev->dev_addr)) { + rc = -EINVAL; + goto err_free_irq; + } + + /* Reset the PCNET32 */ + lp->a.reset(ioaddr); + + /* switch pcnet32 to 32bit mode */ + lp->a.write_bcr(ioaddr, 20, 2); + + if (netif_msg_ifup(lp)) + printk(KERN_DEBUG + "%s: pcnet32_open() irq %d tx/rx rings %#x/%#x init %#x.\n", + dev->name, dev->irq, (u32) (lp->tx_ring_dma_addr), + (u32) (lp->rx_ring_dma_addr), + (u32) (lp->dma_addr + + offsetof(struct pcnet32_private, init_block))); + + /* set/reset autoselect bit */ + val = lp->a.read_bcr(ioaddr, 2) & ~2; + if (lp->options & PCNET32_PORT_ASEL) val |= 2; - } else if (lp->options & PCNET32_PORT_ASEL) { - /* workaround of xSeries250, turn on for 79C975 only */ - i = ((lp->a.read_csr(ioaddr, 88) | - (lp->a.read_csr(ioaddr,89) << 16)) >> 12) & 0xffff; - if (i == 0x2627) - val |= 3; - } - lp->a.write_bcr (ioaddr, 9, val); - } - - /* set/reset GPSI bit in test register */ - val = lp->a.read_csr (ioaddr, 124) & ~0x10; - if ((lp->options & PCNET32_PORT_PORTSEL) == PCNET32_PORT_GPSI) - val |= 0x10; - lp->a.write_csr (ioaddr, 124, val); - - /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ - if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && + lp->a.write_bcr(ioaddr, 2, val); + + /* handle full duplex setting */ + if (lp->mii_if.full_duplex) { + val = lp->a.read_bcr(ioaddr, 9) & ~3; + if (lp->options & PCNET32_PORT_FD) { + val |= 1; + if (lp->options == (PCNET32_PORT_FD | PCNET32_PORT_AUI)) + val |= 2; + } else if (lp->options & PCNET32_PORT_ASEL) { + /* workaround of xSeries250, turn on for 79C975 only */ + i = ((lp->a.read_csr(ioaddr, 88) | + (lp->a. 
+ read_csr(ioaddr, 89) << 16)) >> 12) & 0xffff; + if (i == 0x2627) + val |= 3; + } + lp->a.write_bcr(ioaddr, 9, val); + } + + /* set/reset GPSI bit in test register */ + val = lp->a.read_csr(ioaddr, 124) & ~0x10; + if ((lp->options & PCNET32_PORT_PORTSEL) == PCNET32_PORT_GPSI) + val |= 0x10; + lp->a.write_csr(ioaddr, 124, val); + + /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ + if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && (lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { - if (lp->options & PCNET32_PORT_ASEL) { - lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; - if (netif_msg_link(lp)) - printk(KERN_DEBUG "%s: Setting 100Mb-Full Duplex.\n", - dev->name); - } - } - { - /* - * 24 Jun 2004 according AMD, in order to change the PHY, - * DANAS (or DISPM for 79C976) must be set; then select the speed, - * duplex, and/or enable auto negotiation, and clear DANAS - */ - if (lp->mii && !(lp->options & PCNET32_PORT_ASEL)) { - lp->a.write_bcr(ioaddr, 32, - lp->a.read_bcr(ioaddr, 32) | 0x0080); - /* disable Auto Negotiation, set 10Mpbs, HD */ - val = lp->a.read_bcr(ioaddr, 32) & ~0xb8; - if (lp->options & PCNET32_PORT_FD) - val |= 0x10; - if (lp->options & PCNET32_PORT_100) - val |= 0x08; - lp->a.write_bcr (ioaddr, 32, val); + if (lp->options & PCNET32_PORT_ASEL) { + lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; + if (netif_msg_link(lp)) + printk(KERN_DEBUG + "%s: Setting 100Mb-Full Duplex.\n", + dev->name); + } + } + if (lp->phycount < 2) { + /* + * 24 Jun 2004 according AMD, in order to change the PHY, + * DANAS (or DISPM for 79C976) must be set; then select the speed, + * duplex, and/or enable auto negotiation, and clear DANAS + */ + if (lp->mii && !(lp->options & PCNET32_PORT_ASEL)) { + lp->a.write_bcr(ioaddr, 32, + lp->a.read_bcr(ioaddr, 32) | 0x0080); + /* disable Auto Negotiation, set 10Mpbs, HD */ + val = lp->a.read_bcr(ioaddr, 32) & ~0xb8; + if (lp->options & PCNET32_PORT_FD) + val |= 0x10; + if (lp->options & PCNET32_PORT_100) + val |= 0x08; + lp->a.write_bcr(ioaddr, 32, val); + } else { + if (lp->options & PCNET32_PORT_ASEL) { + lp->a.write_bcr(ioaddr, 32, + lp->a.read_bcr(ioaddr, + 32) | 0x0080); + /* enable auto negotiate, setup, disable fd */ + val = lp->a.read_bcr(ioaddr, 32) & ~0x98; + val |= 0x20; + lp->a.write_bcr(ioaddr, 32, val); + } + } } else { - if (lp->options & PCNET32_PORT_ASEL) { - lp->a.write_bcr(ioaddr, 32, - lp->a.read_bcr(ioaddr, 32) | 0x0080); - /* enable auto negotiate, setup, disable fd */ - val = lp->a.read_bcr(ioaddr, 32) & ~0x98; - val |= 0x20; - lp->a.write_bcr(ioaddr, 32, val); - } + int first_phy = -1; + u16 bmcr; + u32 bcr9; + struct ethtool_cmd ecmd; + + /* + * There is really no good other way to handle multiple PHYs + * other than turning off all automatics + */ + val = lp->a.read_bcr(ioaddr, 2); + lp->a.write_bcr(ioaddr, 2, val & ~2); + val = lp->a.read_bcr(ioaddr, 32); + lp->a.write_bcr(ioaddr, 32, val & ~(1 << 7)); /* stop MII manager */ + + if (!(lp->options & PCNET32_PORT_ASEL)) { + /* setup ecmd */ + ecmd.port = PORT_MII; + ecmd.transceiver = XCVR_INTERNAL; + ecmd.autoneg = AUTONEG_DISABLE; + ecmd.speed = + lp-> + options & PCNET32_PORT_100 ? 
SPEED_100 : SPEED_10; + bcr9 = lp->a.read_bcr(ioaddr, 9); + + if (lp->options & PCNET32_PORT_FD) { + ecmd.duplex = DUPLEX_FULL; + bcr9 |= (1 << 0); + } else { + ecmd.duplex = DUPLEX_HALF; + bcr9 |= ~(1 << 0); + } + lp->a.write_bcr(ioaddr, 9, bcr9); + } + + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + if (lp->phymask & (1 << i)) { + /* isolate all but the first PHY */ + bmcr = mdio_read(dev, i, MII_BMCR); + if (first_phy == -1) { + first_phy = i; + mdio_write(dev, i, MII_BMCR, + bmcr & ~BMCR_ISOLATE); + } else { + mdio_write(dev, i, MII_BMCR, + bmcr | BMCR_ISOLATE); + } + /* use mii_ethtool_sset to setup PHY */ + lp->mii_if.phy_id = i; + ecmd.phy_address = i; + if (lp->options & PCNET32_PORT_ASEL) { + mii_ethtool_gset(&lp->mii_if, &ecmd); + ecmd.autoneg = AUTONEG_ENABLE; + } + mii_ethtool_sset(&lp->mii_if, &ecmd); + } + } + lp->mii_if.phy_id = first_phy; + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: Using PHY number %d.\n", + dev->name, first_phy); } - } #ifdef DO_DXSUFLO - if (lp->dxsuflo) { /* Disable transmit stop on underflow */ - val = lp->a.read_csr (ioaddr, 3); - val |= 0x40; - lp->a.write_csr (ioaddr, 3, val); - } + if (lp->dxsuflo) { /* Disable transmit stop on underflow */ + val = lp->a.read_csr(ioaddr, 3); + val |= 0x40; + lp->a.write_csr(ioaddr, 3, val); + } #endif - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - pcnet32_load_multicast(dev); - - if (pcnet32_init_ring(dev)) { - rc = -ENOMEM; - goto err_free_ring; - } - - /* Re-initialize the PCNET32, and start it when done. */ - lp->a.write_csr (ioaddr, 1, (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)) & 0xffff); - lp->a.write_csr (ioaddr, 2, (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)) >> 16); - - lp->a.write_csr (ioaddr, 4, 0x0915); - lp->a.write_csr (ioaddr, 0, 0x0001); - - netif_start_queue(dev); - - /* If we have mii, print the link status and start the watchdog */ - if (lp->mii) { - mii_check_media (&lp->mii_if, netif_msg_link(lp), 1); - mod_timer (&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); - } - - i = 0; - while (i++ < 100) - if (lp->a.read_csr (ioaddr, 0) & 0x0100) - break; - /* - * We used to clear the InitDone bit, 0x0100, here but Mark Stockton - * reports that doing so triggers a bug in the '974. - */ - lp->a.write_csr (ioaddr, 0, 0x0042); - - if (netif_msg_ifup(lp)) - printk(KERN_DEBUG "%s: pcnet32 open after %d ticks, init block %#x csr0 %4.4x.\n", - dev->name, i, (u32) (lp->dma_addr + - offsetof(struct pcnet32_private, init_block)), - lp->a.read_csr(ioaddr, 0)); - - spin_unlock_irqrestore(&lp->lock, flags); - - return 0; /* Always succeed */ - -err_free_ring: - /* free any allocated skbuffs */ - for (i = 0; i < lp->rx_ring_size; i++) { - lp->rx_ring[i].status = 0; - if (lp->rx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - dev_kfree_skb(lp->rx_skbuff[i]); - } - lp->rx_skbuff[i] = NULL; - lp->rx_dma_addr[i] = 0; - } - - pcnet32_free_ring(dev); - - /* - * Switch back to 16bit mode to avoid problems with dumb - * DOS packet driver after a warm reboot - */ - lp->a.write_bcr (ioaddr, 20, 4); - -err_free_irq: - spin_unlock_irqrestore(&lp->lock, flags); - free_irq(dev->irq, dev); - return rc; + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + pcnet32_load_multicast(dev); + + if (pcnet32_init_ring(dev)) { + rc = -ENOMEM; + goto err_free_ring; + } + + /* Re-initialize the PCNET32, and start it when done. 
*/ + lp->a.write_csr(ioaddr, 1, (lp->dma_addr + + offsetof(struct pcnet32_private, + init_block)) & 0xffff); + lp->a.write_csr(ioaddr, 2, + (lp->dma_addr + + offsetof(struct pcnet32_private, init_block)) >> 16); + + lp->a.write_csr(ioaddr, 4, 0x0915); + lp->a.write_csr(ioaddr, 0, 0x0001); + + netif_start_queue(dev); + + /* Print the link status and start the watchdog */ + pcnet32_check_media(dev, 1); + mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); + + i = 0; + while (i++ < 100) + if (lp->a.read_csr(ioaddr, 0) & 0x0100) + break; + /* + * We used to clear the InitDone bit, 0x0100, here but Mark Stockton + * reports that doing so triggers a bug in the '974. + */ + lp->a.write_csr(ioaddr, 0, 0x0042); + + if (netif_msg_ifup(lp)) + printk(KERN_DEBUG + "%s: pcnet32 open after %d ticks, init block %#x csr0 %4.4x.\n", + dev->name, i, + (u32) (lp->dma_addr + + offsetof(struct pcnet32_private, init_block)), + lp->a.read_csr(ioaddr, 0)); + + spin_unlock_irqrestore(&lp->lock, flags); + + return 0; /* Always succeed */ + + err_free_ring: + /* free any allocated skbuffs */ + for (i = 0; i < lp->rx_ring_size; i++) { + lp->rx_ring[i].status = 0; + if (lp->rx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + dev_kfree_skb(lp->rx_skbuff[i]); + } + lp->rx_skbuff[i] = NULL; + lp->rx_dma_addr[i] = 0; + } + + pcnet32_free_ring(dev); + + /* + * Switch back to 16bit mode to avoid problems with dumb + * DOS packet driver after a warm reboot + */ + lp->a.write_bcr(ioaddr, 20, 4); + + err_free_irq: + spin_unlock_irqrestore(&lp->lock, flags); + free_irq(dev->irq, dev); + return rc; } /* @@ -1746,727 +1801,893 @@ err_free_irq: * restarting the chip, but I'm too lazy to do so right now. dplatt@3do.com */ -static void -pcnet32_purge_tx_ring(struct net_device *dev) +static void pcnet32_purge_tx_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int i; - - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - if (lp->tx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], - lp->tx_skbuff[i]->len, PCI_DMA_TODEVICE); - dev_kfree_skb_any(lp->tx_skbuff[i]); - } - lp->tx_skbuff[i] = NULL; - lp->tx_dma_addr[i] = 0; - } -} + struct pcnet32_private *lp = dev->priv; + int i; + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + if (lp->tx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], + lp->tx_skbuff[i]->len, + PCI_DMA_TODEVICE); + dev_kfree_skb_any(lp->tx_skbuff[i]); + } + lp->tx_skbuff[i] = NULL; + lp->tx_dma_addr[i] = 0; + } +} /* Initialize the PCNET32 Rx and Tx rings. 
*/ -static int -pcnet32_init_ring(struct net_device *dev) +static int pcnet32_init_ring(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int i; - - lp->tx_full = 0; - lp->cur_rx = lp->cur_tx = 0; - lp->dirty_rx = lp->dirty_tx = 0; - - for (i = 0; i < lp->rx_ring_size; i++) { - struct sk_buff *rx_skbuff = lp->rx_skbuff[i]; - if (rx_skbuff == NULL) { - if (!(rx_skbuff = lp->rx_skbuff[i] = dev_alloc_skb (PKT_BUF_SZ))) { - /* there is not much, we can do at this point */ - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_ERR "%s: pcnet32_init_ring dev_alloc_skb failed.\n", - dev->name); - return -1; - } - skb_reserve (rx_skbuff, 2); - } - - rmb(); - if (lp->rx_dma_addr[i] == 0) - lp->rx_dma_addr[i] = pci_map_single(lp->pci_dev, rx_skbuff->data, - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - lp->rx_ring[i].base = (u32)le32_to_cpu(lp->rx_dma_addr[i]); - lp->rx_ring[i].buf_length = le16_to_cpu(2-PKT_BUF_SZ); - wmb(); /* Make sure owner changes after all others are visible */ - lp->rx_ring[i].status = le16_to_cpu(0x8000); - } - /* The Tx buffer address is filled in as needed, but we do need to clear - * the upper ownership bit. */ - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - lp->tx_ring[i].base = 0; - lp->tx_dma_addr[i] = 0; - } - - lp->init_block.tlen_rlen = le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); - for (i = 0; i < 6; i++) - lp->init_block.phys_addr[i] = dev->dev_addr[i]; - lp->init_block.rx_ring = (u32)le32_to_cpu(lp->rx_ring_dma_addr); - lp->init_block.tx_ring = (u32)le32_to_cpu(lp->tx_ring_dma_addr); - wmb(); /* Make sure all changes are visible */ - return 0; + struct pcnet32_private *lp = dev->priv; + int i; + + lp->tx_full = 0; + lp->cur_rx = lp->cur_tx = 0; + lp->dirty_rx = lp->dirty_tx = 0; + + for (i = 0; i < lp->rx_ring_size; i++) { + struct sk_buff *rx_skbuff = lp->rx_skbuff[i]; + if (rx_skbuff == NULL) { + if (! + (rx_skbuff = lp->rx_skbuff[i] = + dev_alloc_skb(PKT_BUF_SZ))) { + /* there is not much, we can do at this point */ + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_ERR + "%s: pcnet32_init_ring dev_alloc_skb failed.\n", + dev->name); + return -1; + } + skb_reserve(rx_skbuff, 2); + } + + rmb(); + if (lp->rx_dma_addr[i] == 0) + lp->rx_dma_addr[i] = + pci_map_single(lp->pci_dev, rx_skbuff->data, + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + lp->rx_ring[i].base = (u32) le32_to_cpu(lp->rx_dma_addr[i]); + lp->rx_ring[i].buf_length = le16_to_cpu(2 - PKT_BUF_SZ); + wmb(); /* Make sure owner changes after all others are visible */ + lp->rx_ring[i].status = le16_to_cpu(0x8000); + } + /* The Tx buffer address is filled in as needed, but we do need to clear + * the upper ownership bit. */ + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + lp->tx_ring[i].base = 0; + lp->tx_dma_addr[i] = 0; + } + + lp->init_block.tlen_rlen = + le16_to_cpu(lp->tx_len_bits | lp->rx_len_bits); + for (i = 0; i < 6; i++) + lp->init_block.phys_addr[i] = dev->dev_addr[i]; + lp->init_block.rx_ring = (u32) le32_to_cpu(lp->rx_ring_dma_addr); + lp->init_block.tx_ring = (u32) le32_to_cpu(lp->tx_ring_dma_addr); + wmb(); /* Make sure all changes are visible */ + return 0; } /* the pcnet32 has been issued a stop or reset. Wait for the stop bit * then flush the pending transmit operations, re-initialize the ring, * and tell the chip to initialize. 
*/ -static void -pcnet32_restart(struct net_device *dev, unsigned int csr0_bits) +static void pcnet32_restart(struct net_device *dev, unsigned int csr0_bits) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - int i; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + int i; - /* wait for stop */ - for (i=0; i<100; i++) - if (lp->a.read_csr(ioaddr, 0) & 0x0004) - break; + /* wait for stop */ + for (i = 0; i < 100; i++) + if (lp->a.read_csr(ioaddr, 0) & 0x0004) + break; - if (i >= 100 && netif_msg_drv(lp)) - printk(KERN_ERR "%s: pcnet32_restart timed out waiting for stop.\n", - dev->name); + if (i >= 100 && netif_msg_drv(lp)) + printk(KERN_ERR + "%s: pcnet32_restart timed out waiting for stop.\n", + dev->name); - pcnet32_purge_tx_ring(dev); - if (pcnet32_init_ring(dev)) - return; + pcnet32_purge_tx_ring(dev); + if (pcnet32_init_ring(dev)) + return; - /* ReInit Ring */ - lp->a.write_csr (ioaddr, 0, 1); - i = 0; - while (i++ < 1000) - if (lp->a.read_csr (ioaddr, 0) & 0x0100) - break; + /* ReInit Ring */ + lp->a.write_csr(ioaddr, 0, 1); + i = 0; + while (i++ < 1000) + if (lp->a.read_csr(ioaddr, 0) & 0x0100) + break; - lp->a.write_csr (ioaddr, 0, csr0_bits); + lp->a.write_csr(ioaddr, 0, csr0_bits); } - -static void -pcnet32_tx_timeout (struct net_device *dev) +static void pcnet32_tx_timeout(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr, flags; - - spin_lock_irqsave(&lp->lock, flags); - /* Transmitter timeout, serious problems. */ - if (pcnet32_debug & NETIF_MSG_DRV) - printk(KERN_ERR "%s: transmit timed out, status %4.4x, resetting.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); - lp->a.write_csr (ioaddr, 0, 0x0004); - lp->stats.tx_errors++; - if (netif_msg_tx_err(lp)) { - int i; - printk(KERN_DEBUG " Ring data dump: dirty_tx %d cur_tx %d%s cur_rx %d.", - lp->dirty_tx, lp->cur_tx, lp->tx_full ? " (full)" : "", - lp->cur_rx); - for (i = 0 ; i < lp->rx_ring_size; i++) - printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", - le32_to_cpu(lp->rx_ring[i].base), - (-le16_to_cpu(lp->rx_ring[i].buf_length)) & 0xffff, - le32_to_cpu(lp->rx_ring[i].msg_length), - le16_to_cpu(lp->rx_ring[i].status)); - for (i = 0 ; i < lp->tx_ring_size; i++) - printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", - le32_to_cpu(lp->tx_ring[i].base), - (-le16_to_cpu(lp->tx_ring[i].length)) & 0xffff, - le32_to_cpu(lp->tx_ring[i].misc), - le16_to_cpu(lp->tx_ring[i].status)); - printk("\n"); - } - pcnet32_restart(dev, 0x0042); - - dev->trans_start = jiffies; - netif_wake_queue(dev); - - spin_unlock_irqrestore(&lp->lock, flags); -} + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr, flags; + + spin_lock_irqsave(&lp->lock, flags); + /* Transmitter timeout, serious problems. */ + if (pcnet32_debug & NETIF_MSG_DRV) + printk(KERN_ERR + "%s: transmit timed out, status %4.4x, resetting.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + lp->a.write_csr(ioaddr, 0, 0x0004); + lp->stats.tx_errors++; + if (netif_msg_tx_err(lp)) { + int i; + printk(KERN_DEBUG + " Ring data dump: dirty_tx %d cur_tx %d%s cur_rx %d.", + lp->dirty_tx, lp->cur_tx, lp->tx_full ? " (full)" : "", + lp->cur_rx); + for (i = 0; i < lp->rx_ring_size; i++) + printk("%s %08x %04x %08x %04x", i & 1 ? 
"" : "\n ", + le32_to_cpu(lp->rx_ring[i].base), + (-le16_to_cpu(lp->rx_ring[i].buf_length)) & + 0xffff, le32_to_cpu(lp->rx_ring[i].msg_length), + le16_to_cpu(lp->rx_ring[i].status)); + for (i = 0; i < lp->tx_ring_size; i++) + printk("%s %08x %04x %08x %04x", i & 1 ? "" : "\n ", + le32_to_cpu(lp->tx_ring[i].base), + (-le16_to_cpu(lp->tx_ring[i].length)) & 0xffff, + le32_to_cpu(lp->tx_ring[i].misc), + le16_to_cpu(lp->tx_ring[i].status)); + printk("\n"); + } + pcnet32_restart(dev, 0x0042); + + dev->trans_start = jiffies; + netif_wake_queue(dev); + spin_unlock_irqrestore(&lp->lock, flags); +} -static int -pcnet32_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int pcnet32_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 status; - int entry; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 status; + int entry; + unsigned long flags; - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - if (netif_msg_tx_queued(lp)) { - printk(KERN_DEBUG "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); - } + if (netif_msg_tx_queued(lp)) { + printk(KERN_DEBUG + "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + } - /* Default status -- will not enable Successful-TxDone - * interrupt when that option is available to us. - */ - status = 0x8300; + /* Default status -- will not enable Successful-TxDone + * interrupt when that option is available to us. + */ + status = 0x8300; - /* Fill in a Tx ring entry */ + /* Fill in a Tx ring entry */ - /* Mask to ring buffer boundary. */ - entry = lp->cur_tx & lp->tx_mod_mask; + /* Mask to ring buffer boundary. */ + entry = lp->cur_tx & lp->tx_mod_mask; - /* Caution: the write order is important here, set the status - * with the "ownership" bits last. */ + /* Caution: the write order is important here, set the status + * with the "ownership" bits last. */ - lp->tx_ring[entry].length = le16_to_cpu(-skb->len); + lp->tx_ring[entry].length = le16_to_cpu(-skb->len); - lp->tx_ring[entry].misc = 0x00000000; + lp->tx_ring[entry].misc = 0x00000000; - lp->tx_skbuff[entry] = skb; - lp->tx_dma_addr[entry] = pci_map_single(lp->pci_dev, skb->data, skb->len, - PCI_DMA_TODEVICE); - lp->tx_ring[entry].base = (u32)le32_to_cpu(lp->tx_dma_addr[entry]); - wmb(); /* Make sure owner changes after all others are visible */ - lp->tx_ring[entry].status = le16_to_cpu(status); + lp->tx_skbuff[entry] = skb; + lp->tx_dma_addr[entry] = + pci_map_single(lp->pci_dev, skb->data, skb->len, PCI_DMA_TODEVICE); + lp->tx_ring[entry].base = (u32) le32_to_cpu(lp->tx_dma_addr[entry]); + wmb(); /* Make sure owner changes after all others are visible */ + lp->tx_ring[entry].status = le16_to_cpu(status); - lp->cur_tx++; - lp->stats.tx_bytes += skb->len; + lp->cur_tx++; + lp->stats.tx_bytes += skb->len; - /* Trigger an immediate send poll. */ - lp->a.write_csr (ioaddr, 0, 0x0048); + /* Trigger an immediate send poll. 
*/ + lp->a.write_csr(ioaddr, 0, 0x0048); - dev->trans_start = jiffies; + dev->trans_start = jiffies; - if (lp->tx_ring[(entry+1) & lp->tx_mod_mask].base != 0) { - lp->tx_full = 1; - netif_stop_queue(dev); - } - spin_unlock_irqrestore(&lp->lock, flags); - return 0; + if (lp->tx_ring[(entry + 1) & lp->tx_mod_mask].base != 0) { + lp->tx_full = 1; + netif_stop_queue(dev); + } + spin_unlock_irqrestore(&lp->lock, flags); + return 0; } /* The PCNET32 interrupt handler. */ static irqreturn_t -pcnet32_interrupt(int irq, void *dev_id, struct pt_regs * regs) +pcnet32_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - struct net_device *dev = dev_id; - struct pcnet32_private *lp; - unsigned long ioaddr; - u16 csr0,rap; - int boguscnt = max_interrupt_work; - int must_restart; - - if (!dev) { - if (pcnet32_debug & NETIF_MSG_INTR) - printk (KERN_DEBUG "%s(): irq %d for unknown device\n", - __FUNCTION__, irq); - return IRQ_NONE; - } - - ioaddr = dev->base_addr; - lp = dev->priv; - - spin_lock(&lp->lock); - - rap = lp->a.read_rap(ioaddr); - while ((csr0 = lp->a.read_csr (ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) { - if (csr0 == 0xffff) { - break; /* PCMCIA remove happened */ + struct net_device *dev = dev_id; + struct pcnet32_private *lp; + unsigned long ioaddr; + u16 csr0, rap; + int boguscnt = max_interrupt_work; + int must_restart; + + if (!dev) { + if (pcnet32_debug & NETIF_MSG_INTR) + printk(KERN_DEBUG "%s(): irq %d for unknown device\n", + __FUNCTION__, irq); + return IRQ_NONE; } - /* Acknowledge all of the current interrupt sources ASAP. */ - lp->a.write_csr (ioaddr, 0, csr0 & ~0x004f); - must_restart = 0; + ioaddr = dev->base_addr; + lp = dev->priv; - if (netif_msg_intr(lp)) - printk(KERN_DEBUG "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n", - dev->name, csr0, lp->a.read_csr (ioaddr, 0)); - - if (csr0 & 0x0400) /* Rx interrupt */ - pcnet32_rx(dev); - - if (csr0 & 0x0200) { /* Tx-done interrupt */ - unsigned int dirty_tx = lp->dirty_tx; - int delta; - - while (dirty_tx != lp->cur_tx) { - int entry = dirty_tx & lp->tx_mod_mask; - int status = (short)le16_to_cpu(lp->tx_ring[entry].status); - - if (status < 0) - break; /* It still hasn't been Txed */ - - lp->tx_ring[entry].base = 0; - - if (status & 0x4000) { - /* There was an major error, log it. */ - int err_status = le32_to_cpu(lp->tx_ring[entry].misc); - lp->stats.tx_errors++; - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx error status=%04x err_status=%08x\n", - dev->name, status, err_status); - if (err_status & 0x04000000) lp->stats.tx_aborted_errors++; - if (err_status & 0x08000000) lp->stats.tx_carrier_errors++; - if (err_status & 0x10000000) lp->stats.tx_window_errors++; + spin_lock(&lp->lock); + + rap = lp->a.read_rap(ioaddr); + while ((csr0 = lp->a.read_csr(ioaddr, 0)) & 0x8f00 && --boguscnt >= 0) { + if (csr0 == 0xffff) { + break; /* PCMCIA remove happened */ + } + /* Acknowledge all of the current interrupt sources ASAP. */ + lp->a.write_csr(ioaddr, 0, csr0 & ~0x004f); + + must_restart = 0; + + if (netif_msg_intr(lp)) + printk(KERN_DEBUG + "%s: interrupt csr0=%#2.2x new csr=%#2.2x.\n", + dev->name, csr0, lp->a.read_csr(ioaddr, 0)); + + if (csr0 & 0x0400) /* Rx interrupt */ + pcnet32_rx(dev); + + if (csr0 & 0x0200) { /* Tx-done interrupt */ + unsigned int dirty_tx = lp->dirty_tx; + int delta; + + while (dirty_tx != lp->cur_tx) { + int entry = dirty_tx & lp->tx_mod_mask; + int status = + (short)le16_to_cpu(lp->tx_ring[entry]. 
+ status); + + if (status < 0) + break; /* It still hasn't been Txed */ + + lp->tx_ring[entry].base = 0; + + if (status & 0x4000) { + /* There was an major error, log it. */ + int err_status = + le32_to_cpu(lp->tx_ring[entry]. + misc); + lp->stats.tx_errors++; + if (netif_msg_tx_err(lp)) + printk(KERN_ERR + "%s: Tx error status=%04x err_status=%08x\n", + dev->name, status, + err_status); + if (err_status & 0x04000000) + lp->stats.tx_aborted_errors++; + if (err_status & 0x08000000) + lp->stats.tx_carrier_errors++; + if (err_status & 0x10000000) + lp->stats.tx_window_errors++; #ifndef DO_DXSUFLO - if (err_status & 0x40000000) { - lp->stats.tx_fifo_errors++; - /* Ackk! On FIFO errors the Tx unit is turned off! */ - /* Remove this verbosity later! */ - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", - dev->name, csr0); - must_restart = 1; - } + if (err_status & 0x40000000) { + lp->stats.tx_fifo_errors++; + /* Ackk! On FIFO errors the Tx unit is turned off! */ + /* Remove this verbosity later! */ + if (netif_msg_tx_err(lp)) + printk(KERN_ERR + "%s: Tx FIFO error! CSR0=%4.4x\n", + dev->name, csr0); + must_restart = 1; + } #else - if (err_status & 0x40000000) { - lp->stats.tx_fifo_errors++; - if (! lp->dxsuflo) { /* If controller doesn't recover ... */ - /* Ackk! On FIFO errors the Tx unit is turned off! */ - /* Remove this verbosity later! */ - if (netif_msg_tx_err(lp)) - printk(KERN_ERR "%s: Tx FIFO error! CSR0=%4.4x\n", - dev->name, csr0); - must_restart = 1; - } - } + if (err_status & 0x40000000) { + lp->stats.tx_fifo_errors++; + if (!lp->dxsuflo) { /* If controller doesn't recover ... */ + /* Ackk! On FIFO errors the Tx unit is turned off! */ + /* Remove this verbosity later! */ + if (netif_msg_tx_err + (lp)) + printk(KERN_ERR + "%s: Tx FIFO error! CSR0=%4.4x\n", + dev-> + name, + csr0); + must_restart = 1; + } + } #endif - } else { - if (status & 0x1800) - lp->stats.collisions++; - lp->stats.tx_packets++; + } else { + if (status & 0x1800) + lp->stats.collisions++; + lp->stats.tx_packets++; + } + + /* We must free the original skb */ + if (lp->tx_skbuff[entry]) { + pci_unmap_single(lp->pci_dev, + lp->tx_dma_addr[entry], + lp->tx_skbuff[entry]-> + len, PCI_DMA_TODEVICE); + dev_kfree_skb_irq(lp->tx_skbuff[entry]); + lp->tx_skbuff[entry] = NULL; + lp->tx_dma_addr[entry] = 0; + } + dirty_tx++; + } + + delta = + (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask + + lp->tx_ring_size); + if (delta > lp->tx_ring_size) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n", + dev->name, dirty_tx, lp->cur_tx, + lp->tx_full); + dirty_tx += lp->tx_ring_size; + delta -= lp->tx_ring_size; + } + + if (lp->tx_full && + netif_queue_stopped(dev) && + delta < lp->tx_ring_size - 2) { + /* The ring is no longer full, clear tbusy. */ + lp->tx_full = 0; + netif_wake_queue(dev); + } + lp->dirty_tx = dirty_tx; + } + + /* Log misc errors. */ + if (csr0 & 0x4000) + lp->stats.tx_errors++; /* Tx babble. */ + if (csr0 & 0x1000) { + /* + * this happens when our receive ring is full. This shouldn't + * be a problem as we will see normal rx interrupts for the frames + * in the receive ring. But there are some PCI chipsets (I can + * reproduce this on SP3G with Intel saturn chipset) which have + * sometimes problems and will fill up the receive ring with + * error descriptors. In this situation we don't get a rx + * interrupt, but a missed frame interrupt sooner or later. + * So we try to clean up our receive ring here. 
+ */ + pcnet32_rx(dev); + lp->stats.rx_errors++; /* Missed a Rx frame. */ + } + if (csr0 & 0x0800) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Bus master arbitration failure, status %4.4x.\n", + dev->name, csr0); + /* unlike for the lance, there is no restart needed */ } - /* We must free the original skb */ - if (lp->tx_skbuff[entry]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[entry], - lp->tx_skbuff[entry]->len, PCI_DMA_TODEVICE); - dev_kfree_skb_irq(lp->tx_skbuff[entry]); - lp->tx_skbuff[entry] = NULL; - lp->tx_dma_addr[entry] = 0; + if (must_restart) { + /* reset the chip to clear the error condition, then restart */ + lp->a.reset(ioaddr); + lp->a.write_csr(ioaddr, 4, 0x0915); + pcnet32_restart(dev, 0x0002); + netif_wake_queue(dev); } - dirty_tx++; - } - - delta = (lp->cur_tx - dirty_tx) & (lp->tx_mod_mask + lp->tx_ring_size); - if (delta > lp->tx_ring_size) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: out-of-sync dirty pointer, %d vs. %d, full=%d.\n", - dev->name, dirty_tx, lp->cur_tx, lp->tx_full); - dirty_tx += lp->tx_ring_size; - delta -= lp->tx_ring_size; - } - - if (lp->tx_full && - netif_queue_stopped(dev) && - delta < lp->tx_ring_size - 2) { - /* The ring is no longer full, clear tbusy. */ - lp->tx_full = 0; - netif_wake_queue (dev); - } - lp->dirty_tx = dirty_tx; - } - - /* Log misc errors. */ - if (csr0 & 0x4000) lp->stats.tx_errors++; /* Tx babble. */ - if (csr0 & 0x1000) { - /* - * this happens when our receive ring is full. This shouldn't - * be a problem as we will see normal rx interrupts for the frames - * in the receive ring. But there are some PCI chipsets (I can - * reproduce this on SP3G with Intel saturn chipset) which have - * sometimes problems and will fill up the receive ring with - * error descriptors. In this situation we don't get a rx - * interrupt, but a missed frame interrupt sooner or later. - * So we try to clean up our receive ring here. - */ - pcnet32_rx(dev); - lp->stats.rx_errors++; /* Missed a Rx frame. */ - } - if (csr0 & 0x0800) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Bus master arbitration failure, status %4.4x.\n", - dev->name, csr0); - /* unlike for the lance, there is no restart needed */ - } - - if (must_restart) { - /* reset the chip to clear the error condition, then restart */ - lp->a.reset(ioaddr); - lp->a.write_csr(ioaddr, 4, 0x0915); - pcnet32_restart(dev, 0x0002); - netif_wake_queue(dev); - } - } - - /* Set interrupt enable. */ - lp->a.write_csr (ioaddr, 0, 0x0040); - lp->a.write_rap (ioaddr,rap); - - if (netif_msg_intr(lp)) - printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n", - dev->name, lp->a.read_csr (ioaddr, 0)); - - spin_unlock(&lp->lock); - - return IRQ_HANDLED; + } + + /* Set interrupt enable. */ + lp->a.write_csr(ioaddr, 0, 0x0040); + lp->a.write_rap(ioaddr, rap); + + if (netif_msg_intr(lp)) + printk(KERN_DEBUG "%s: exiting interrupt, csr0=%#4.4x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); + + spin_unlock(&lp->lock); + + return IRQ_HANDLED; } -static int -pcnet32_rx(struct net_device *dev) +static int pcnet32_rx(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - int entry = lp->cur_rx & lp->rx_mod_mask; - int boguscnt = lp->rx_ring_size / 2; - - /* If we own the next entry, it's a new packet. Send it up. */ - while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) { - int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8; - - if (status != 0x03) { /* There was an error. 
*/ - /* - * There is a tricky error noted by John Murphy, - * <murf@perftech.com> to Russ Nelson: Even with full-sized - * buffers it's possible for a jabber packet to use two - * buffers, with only the last correctly noting the error. - */ - if (status & 0x01) /* Only count a general error at the */ - lp->stats.rx_errors++; /* end of a packet.*/ - if (status & 0x20) lp->stats.rx_frame_errors++; - if (status & 0x10) lp->stats.rx_over_errors++; - if (status & 0x08) lp->stats.rx_crc_errors++; - if (status & 0x04) lp->stats.rx_fifo_errors++; - lp->rx_ring[entry].status &= le16_to_cpu(0x03ff); - } else { - /* Malloc up new buffer, compatible with net-2e. */ - short pkt_len = (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff)-4; - struct sk_buff *skb; - - /* Discard oversize frames. */ - if (unlikely(pkt_len > PKT_BUF_SZ - 2)) { - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Impossible packet size %d!\n", - dev->name, pkt_len); - lp->stats.rx_errors++; - } else if (pkt_len < 60) { - if (netif_msg_rx_err(lp)) - printk(KERN_ERR "%s: Runt packet!\n", dev->name); - lp->stats.rx_errors++; - } else { - int rx_in_place = 0; - - if (pkt_len > rx_copybreak) { - struct sk_buff *newskb; - - if ((newskb = dev_alloc_skb(PKT_BUF_SZ))) { - skb_reserve (newskb, 2); - skb = lp->rx_skbuff[entry]; - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - skb_put (skb, pkt_len); - lp->rx_skbuff[entry] = newskb; - newskb->dev = dev; - lp->rx_dma_addr[entry] = - pci_map_single(lp->pci_dev, newskb->data, - PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); - lp->rx_ring[entry].base = le32_to_cpu(lp->rx_dma_addr[entry]); - rx_in_place = 1; - } else - skb = NULL; + struct pcnet32_private *lp = dev->priv; + int entry = lp->cur_rx & lp->rx_mod_mask; + int boguscnt = lp->rx_ring_size / 2; + + /* If we own the next entry, it's a new packet. Send it up. */ + while ((short)le16_to_cpu(lp->rx_ring[entry].status) >= 0) { + int status = (short)le16_to_cpu(lp->rx_ring[entry].status) >> 8; + + if (status != 0x03) { /* There was an error. */ + /* + * There is a tricky error noted by John Murphy, + * <murf@perftech.com> to Russ Nelson: Even with full-sized + * buffers it's possible for a jabber packet to use two + * buffers, with only the last correctly noting the error. + */ + if (status & 0x01) /* Only count a general error at the */ + lp->stats.rx_errors++; /* end of a packet. 
*/ + if (status & 0x20) + lp->stats.rx_frame_errors++; + if (status & 0x10) + lp->stats.rx_over_errors++; + if (status & 0x08) + lp->stats.rx_crc_errors++; + if (status & 0x04) + lp->stats.rx_fifo_errors++; + lp->rx_ring[entry].status &= le16_to_cpu(0x03ff); } else { - skb = dev_alloc_skb(pkt_len+2); - } - - if (skb == NULL) { - int i; - if (netif_msg_drv(lp)) - printk(KERN_ERR "%s: Memory squeeze, deferring packet.\n", - dev->name); - for (i = 0; i < lp->rx_ring_size; i++) - if ((short)le16_to_cpu(lp->rx_ring[(entry+i) - & lp->rx_mod_mask].status) < 0) - break; - - if (i > lp->rx_ring_size -2) { - lp->stats.rx_dropped++; - lp->rx_ring[entry].status |= le16_to_cpu(0x8000); - wmb(); /* Make sure adapter sees owner change */ - lp->cur_rx++; - } - break; - } - skb->dev = dev; - if (!rx_in_place) { - skb_reserve(skb,2); /* 16 byte align */ - skb_put(skb,pkt_len); /* Make room */ - pci_dma_sync_single_for_cpu(lp->pci_dev, - lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - eth_copy_and_sum(skb, - (unsigned char *)(lp->rx_skbuff[entry]->data), - pkt_len,0); - pci_dma_sync_single_for_device(lp->pci_dev, - lp->rx_dma_addr[entry], - PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); + /* Malloc up new buffer, compatible with net-2e. */ + short pkt_len = + (le32_to_cpu(lp->rx_ring[entry].msg_length) & 0xfff) + - 4; + struct sk_buff *skb; + + /* Discard oversize frames. */ + if (unlikely(pkt_len > PKT_BUF_SZ - 2)) { + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Impossible packet size %d!\n", + dev->name, pkt_len); + lp->stats.rx_errors++; + } else if (pkt_len < 60) { + if (netif_msg_rx_err(lp)) + printk(KERN_ERR "%s: Runt packet!\n", + dev->name); + lp->stats.rx_errors++; + } else { + int rx_in_place = 0; + + if (pkt_len > rx_copybreak) { + struct sk_buff *newskb; + + if ((newskb = + dev_alloc_skb(PKT_BUF_SZ))) { + skb_reserve(newskb, 2); + skb = lp->rx_skbuff[entry]; + pci_unmap_single(lp->pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ - 2, + PCI_DMA_FROMDEVICE); + skb_put(skb, pkt_len); + lp->rx_skbuff[entry] = newskb; + newskb->dev = dev; + lp->rx_dma_addr[entry] = + pci_map_single(lp->pci_dev, + newskb->data, + PKT_BUF_SZ - + 2, + PCI_DMA_FROMDEVICE); + lp->rx_ring[entry].base = + le32_to_cpu(lp-> + rx_dma_addr + [entry]); + rx_in_place = 1; + } else + skb = NULL; + } else { + skb = dev_alloc_skb(pkt_len + 2); + } + + if (skb == NULL) { + int i; + if (netif_msg_drv(lp)) + printk(KERN_ERR + "%s: Memory squeeze, deferring packet.\n", + dev->name); + for (i = 0; i < lp->rx_ring_size; i++) + if ((short) + le16_to_cpu(lp-> + rx_ring[(entry + + i) + & lp-> + rx_mod_mask]. 
+ status) < 0) + break; + + if (i > lp->rx_ring_size - 2) { + lp->stats.rx_dropped++; + lp->rx_ring[entry].status |= + le16_to_cpu(0x8000); + wmb(); /* Make sure adapter sees owner change */ + lp->cur_rx++; + } + break; + } + skb->dev = dev; + if (!rx_in_place) { + skb_reserve(skb, 2); /* 16 byte align */ + skb_put(skb, pkt_len); /* Make room */ + pci_dma_sync_single_for_cpu(lp->pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ - + 2, + PCI_DMA_FROMDEVICE); + eth_copy_and_sum(skb, + (unsigned char *)(lp-> + rx_skbuff + [entry]-> + data), + pkt_len, 0); + pci_dma_sync_single_for_device(lp-> + pci_dev, + lp-> + rx_dma_addr + [entry], + PKT_BUF_SZ + - 2, + PCI_DMA_FROMDEVICE); + } + lp->stats.rx_bytes += skb->len; + skb->protocol = eth_type_trans(skb, dev); + netif_rx(skb); + dev->last_rx = jiffies; + lp->stats.rx_packets++; + } } - lp->stats.rx_bytes += skb->len; - skb->protocol=eth_type_trans(skb,dev); - netif_rx(skb); - dev->last_rx = jiffies; - lp->stats.rx_packets++; - } + /* + * The docs say that the buffer length isn't touched, but Andrew Boyd + * of QNX reports that some revs of the 79C965 clear it. + */ + lp->rx_ring[entry].buf_length = le16_to_cpu(2 - PKT_BUF_SZ); + wmb(); /* Make sure owner changes after all others are visible */ + lp->rx_ring[entry].status |= le16_to_cpu(0x8000); + entry = (++lp->cur_rx) & lp->rx_mod_mask; + if (--boguscnt <= 0) + break; /* don't stay in loop forever */ } - /* - * The docs say that the buffer length isn't touched, but Andrew Boyd - * of QNX reports that some revs of the 79C965 clear it. - */ - lp->rx_ring[entry].buf_length = le16_to_cpu(2-PKT_BUF_SZ); - wmb(); /* Make sure owner changes after all others are visible */ - lp->rx_ring[entry].status |= le16_to_cpu(0x8000); - entry = (++lp->cur_rx) & lp->rx_mod_mask; - if (--boguscnt <= 0) break; /* don't stay in loop forever */ - } - - return 0; + + return 0; } -static int -pcnet32_close(struct net_device *dev) +static int pcnet32_close(struct net_device *dev) { - unsigned long ioaddr = dev->base_addr; - struct pcnet32_private *lp = dev->priv; - int i; - unsigned long flags; + unsigned long ioaddr = dev->base_addr; + struct pcnet32_private *lp = dev->priv; + int i; + unsigned long flags; - del_timer_sync(&lp->watchdog_timer); + del_timer_sync(&lp->watchdog_timer); - netif_stop_queue(dev); + netif_stop_queue(dev); - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); + lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112); - if (netif_msg_ifdown(lp)) - printk(KERN_DEBUG "%s: Shutting down ethercard, status was %2.2x.\n", - dev->name, lp->a.read_csr (ioaddr, 0)); + if (netif_msg_ifdown(lp)) + printk(KERN_DEBUG + "%s: Shutting down ethercard, status was %2.2x.\n", + dev->name, lp->a.read_csr(ioaddr, 0)); - /* We stop the PCNET32 here -- it occasionally polls memory if we don't. */ - lp->a.write_csr (ioaddr, 0, 0x0004); + /* We stop the PCNET32 here -- it occasionally polls memory if we don't. 
*/ + lp->a.write_csr(ioaddr, 0, 0x0004); - /* - * Switch back to 16bit mode to avoid problems with dumb - * DOS packet driver after a warm reboot - */ - lp->a.write_bcr (ioaddr, 20, 4); + /* + * Switch back to 16bit mode to avoid problems with dumb + * DOS packet driver after a warm reboot + */ + lp->a.write_bcr(ioaddr, 20, 4); - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - free_irq(dev->irq, dev); + free_irq(dev->irq, dev); - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); - /* free all allocated skbuffs */ - for (i = 0; i < lp->rx_ring_size; i++) { - lp->rx_ring[i].status = 0; - wmb(); /* Make sure adapter sees owner change */ - if (lp->rx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, - PCI_DMA_FROMDEVICE); - dev_kfree_skb(lp->rx_skbuff[i]); + /* free all allocated skbuffs */ + for (i = 0; i < lp->rx_ring_size; i++) { + lp->rx_ring[i].status = 0; + wmb(); /* Make sure adapter sees owner change */ + if (lp->rx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], + PKT_BUF_SZ - 2, PCI_DMA_FROMDEVICE); + dev_kfree_skb(lp->rx_skbuff[i]); + } + lp->rx_skbuff[i] = NULL; + lp->rx_dma_addr[i] = 0; } - lp->rx_skbuff[i] = NULL; - lp->rx_dma_addr[i] = 0; - } - for (i = 0; i < lp->tx_ring_size; i++) { - lp->tx_ring[i].status = 0; /* CPU owns buffer */ - wmb(); /* Make sure adapter sees owner change */ - if (lp->tx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], - lp->tx_skbuff[i]->len, PCI_DMA_TODEVICE); - dev_kfree_skb(lp->tx_skbuff[i]); + for (i = 0; i < lp->tx_ring_size; i++) { + lp->tx_ring[i].status = 0; /* CPU owns buffer */ + wmb(); /* Make sure adapter sees owner change */ + if (lp->tx_skbuff[i]) { + pci_unmap_single(lp->pci_dev, lp->tx_dma_addr[i], + lp->tx_skbuff[i]->len, + PCI_DMA_TODEVICE); + dev_kfree_skb(lp->tx_skbuff[i]); + } + lp->tx_skbuff[i] = NULL; + lp->tx_dma_addr[i] = 0; } - lp->tx_skbuff[i] = NULL; - lp->tx_dma_addr[i] = 0; - } - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); - return 0; + return 0; } -static struct net_device_stats * -pcnet32_get_stats(struct net_device *dev) +static struct net_device_stats *pcnet32_get_stats(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 saved_addr; - unsigned long flags; - - spin_lock_irqsave(&lp->lock, flags); - saved_addr = lp->a.read_rap(ioaddr); - lp->stats.rx_missed_errors = lp->a.read_csr (ioaddr, 112); - lp->a.write_rap(ioaddr, saved_addr); - spin_unlock_irqrestore(&lp->lock, flags); - - return &lp->stats; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 saved_addr; + unsigned long flags; + + spin_lock_irqsave(&lp->lock, flags); + saved_addr = lp->a.read_rap(ioaddr); + lp->stats.rx_missed_errors = lp->a.read_csr(ioaddr, 112); + lp->a.write_rap(ioaddr, saved_addr); + spin_unlock_irqrestore(&lp->lock, flags); + + return &lp->stats; } /* taken from the sunlance driver, which it took from the depca driver */ -static void pcnet32_load_multicast (struct net_device *dev) +static void pcnet32_load_multicast(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - volatile struct pcnet32_init_block *ib = &lp->init_block; - volatile u16 *mcast_table = (u16 *)&ib->filter; - struct dev_mc_list *dmi=dev->mc_list; - char *addrs; - int i; - u32 crc; - - /* set all multicast bits */ - if (dev->flags & IFF_ALLMULTI) { - ib->filter[0] = 0xffffffff; - ib->filter[1] 
= 0xffffffff; + struct pcnet32_private *lp = dev->priv; + volatile struct pcnet32_init_block *ib = &lp->init_block; + volatile u16 *mcast_table = (u16 *) & ib->filter; + struct dev_mc_list *dmi = dev->mc_list; + char *addrs; + int i; + u32 crc; + + /* set all multicast bits */ + if (dev->flags & IFF_ALLMULTI) { + ib->filter[0] = 0xffffffff; + ib->filter[1] = 0xffffffff; + return; + } + /* clear the multicast filter */ + ib->filter[0] = 0; + ib->filter[1] = 0; + + /* Add addresses */ + for (i = 0; i < dev->mc_count; i++) { + addrs = dmi->dmi_addr; + dmi = dmi->next; + + /* multicast address? */ + if (!(*addrs & 1)) + continue; + + crc = ether_crc_le(6, addrs); + crc = crc >> 26; + mcast_table[crc >> 4] = + le16_to_cpu(le16_to_cpu(mcast_table[crc >> 4]) | + (1 << (crc & 0xf))); + } return; - } - /* clear the multicast filter */ - ib->filter[0] = 0; - ib->filter[1] = 0; - - /* Add addresses */ - for (i = 0; i < dev->mc_count; i++) { - addrs = dmi->dmi_addr; - dmi = dmi->next; - - /* multicast address? */ - if (!(*addrs & 1)) - continue; - - crc = ether_crc_le(6, addrs); - crc = crc >> 26; - mcast_table [crc >> 4] = le16_to_cpu( - le16_to_cpu(mcast_table [crc >> 4]) | (1 << (crc & 0xf))); - } - return; } - /* * Set or clear the multicast filter for this adaptor. */ static void pcnet32_set_multicast_list(struct net_device *dev) { - unsigned long ioaddr = dev->base_addr, flags; - struct pcnet32_private *lp = dev->priv; - - spin_lock_irqsave(&lp->lock, flags); - if (dev->flags&IFF_PROMISC) { - /* Log any net taps. */ - if (netif_msg_hw(lp)) - printk(KERN_INFO "%s: Promiscuous mode enabled.\n", dev->name); - lp->init_block.mode = le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) << 7); - } else { - lp->init_block.mode = le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); - pcnet32_load_multicast (dev); - } - - lp->a.write_csr (ioaddr, 0, 0x0004); /* Temporarily stop the lance. */ - pcnet32_restart(dev, 0x0042); /* Resume normal operation */ - netif_wake_queue(dev); - - spin_unlock_irqrestore(&lp->lock, flags); + unsigned long ioaddr = dev->base_addr, flags; + struct pcnet32_private *lp = dev->priv; + + spin_lock_irqsave(&lp->lock, flags); + if (dev->flags & IFF_PROMISC) { + /* Log any net taps. */ + if (netif_msg_hw(lp)) + printk(KERN_INFO "%s: Promiscuous mode enabled.\n", + dev->name); + lp->init_block.mode = + le16_to_cpu(0x8000 | (lp->options & PCNET32_PORT_PORTSEL) << + 7); + } else { + lp->init_block.mode = + le16_to_cpu((lp->options & PCNET32_PORT_PORTSEL) << 7); + pcnet32_load_multicast(dev); + } + + lp->a.write_csr(ioaddr, 0, 0x0004); /* Temporarily stop the lance. 
*/ + pcnet32_restart(dev, 0x0042); /* Resume normal operation */ + netif_wake_queue(dev); + + spin_unlock_irqrestore(&lp->lock, flags); } /* This routine assumes that the lp->lock is held */ static int mdio_read(struct net_device *dev, int phy_id, int reg_num) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; - u16 val_out; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; + u16 val_out; - if (!lp->mii) - return 0; + if (!lp->mii) + return 0; - lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); - val_out = lp->a.read_bcr(ioaddr, 34); + lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); + val_out = lp->a.read_bcr(ioaddr, 34); - return val_out; + return val_out; } /* This routine assumes that the lp->lock is held */ static void mdio_write(struct net_device *dev, int phy_id, int reg_num, int val) { - struct pcnet32_private *lp = dev->priv; - unsigned long ioaddr = dev->base_addr; + struct pcnet32_private *lp = dev->priv; + unsigned long ioaddr = dev->base_addr; - if (!lp->mii) - return; + if (!lp->mii) + return; - lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); - lp->a.write_bcr(ioaddr, 34, val); + lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); + lp->a.write_bcr(ioaddr, 34, val); } static int pcnet32_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { - struct pcnet32_private *lp = dev->priv; - int rc; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + int rc; + unsigned long flags; + + /* SIOC[GS]MIIxxx ioctls */ + if (lp->mii) { + spin_lock_irqsave(&lp->lock, flags); + rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL); + spin_unlock_irqrestore(&lp->lock, flags); + } else { + rc = -EOPNOTSUPP; + } + + return rc; +} + +static int pcnet32_check_otherphy(struct net_device *dev) +{ + struct pcnet32_private *lp = dev->priv; + struct mii_if_info mii = lp->mii_if; + u16 bmcr; + int i; - /* SIOC[GS]MIIxxx ioctls */ - if (lp->mii) { - spin_lock_irqsave(&lp->lock, flags); - rc = generic_mii_ioctl(&lp->mii_if, if_mii(rq), cmd, NULL); - spin_unlock_irqrestore(&lp->lock, flags); - } else { - rc = -EOPNOTSUPP; - } + for (i = 0; i < PCNET32_MAX_PHYS; i++) { + if (i == lp->mii_if.phy_id) + continue; /* skip active phy */ + if (lp->phymask & (1 << i)) { + mii.phy_id = i; + if (mii_link_ok(&mii)) { + /* found PHY with active link */ + if (netif_msg_link(lp)) + printk(KERN_INFO + "%s: Using PHY number %d.\n", + dev->name, i); + + /* isolate inactive phy */ + bmcr = + mdio_read(dev, lp->mii_if.phy_id, MII_BMCR); + mdio_write(dev, lp->mii_if.phy_id, MII_BMCR, + bmcr | BMCR_ISOLATE); + + /* de-isolate new phy */ + bmcr = mdio_read(dev, i, MII_BMCR); + mdio_write(dev, i, MII_BMCR, + bmcr & ~BMCR_ISOLATE); + + /* set new phy address */ + lp->mii_if.phy_id = i; + return 1; + } + } + } + return 0; +} + +/* + * Show the status of the media. Similar to mii_check_media however it + * correctly shows the link speed for all (tested) pcnet32 variants. + * Devices with no mii just report link state without speed. + * + * Caller is assumed to hold and release the lp->lock. + */ - return rc; +static void pcnet32_check_media(struct net_device *dev, int verbose) +{ + struct pcnet32_private *lp = dev->priv; + int curr_link; + int prev_link = netif_carrier_ok(dev) ? 
1 : 0; + u32 bcr9; + + if (lp->mii) { + curr_link = mii_link_ok(&lp->mii_if); + } else { + ulong ioaddr = dev->base_addr; /* card base I/O address */ + curr_link = (lp->a.read_bcr(ioaddr, 4) != 0xc0); + } + if (!curr_link) { + if (prev_link || verbose) { + netif_carrier_off(dev); + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: link down\n", dev->name); + } + if (lp->phycount > 1) { + curr_link = pcnet32_check_otherphy(dev); + prev_link = 0; + } + } else if (verbose || !prev_link) { + netif_carrier_on(dev); + if (lp->mii) { + if (netif_msg_link(lp)) { + struct ethtool_cmd ecmd; + mii_ethtool_gset(&lp->mii_if, &ecmd); + printk(KERN_INFO + "%s: link up, %sMbps, %s-duplex\n", + dev->name, + (ecmd.speed == SPEED_100) ? "100" : "10", + (ecmd.duplex == + DUPLEX_FULL) ? "full" : "half"); + } + bcr9 = lp->a.read_bcr(dev->base_addr, 9); + if ((bcr9 & (1 << 0)) != lp->mii_if.full_duplex) { + if (lp->mii_if.full_duplex) + bcr9 |= (1 << 0); + else + bcr9 &= ~(1 << 0); + lp->a.write_bcr(dev->base_addr, 9, bcr9); + } + } else { + if (netif_msg_link(lp)) + printk(KERN_INFO "%s: link up\n", dev->name); + } + } } +/* + * Check for loss of link and link establishment. + * Can not use mii_check_media because it does nothing if mode is forced. + */ + static void pcnet32_watchdog(struct net_device *dev) { - struct pcnet32_private *lp = dev->priv; - unsigned long flags; + struct pcnet32_private *lp = dev->priv; + unsigned long flags; - /* Print the link status if it has changed */ - if (lp->mii) { + /* Print the link status if it has changed */ spin_lock_irqsave(&lp->lock, flags); - mii_check_media (&lp->mii_if, netif_msg_link(lp), 0); + pcnet32_check_media(dev, 0); spin_unlock_irqrestore(&lp->lock, flags); - } - mod_timer (&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); + mod_timer(&(lp->watchdog_timer), PCNET32_WATCHDOG_TIMEOUT); } static void __devexit pcnet32_remove_one(struct pci_dev *pdev) { - struct net_device *dev = pci_get_drvdata(pdev); - - if (dev) { - struct pcnet32_private *lp = dev->priv; - - unregister_netdev(dev); - pcnet32_free_ring(dev); - release_region(dev->base_addr, PCNET32_TOTAL_SIZE); - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); - free_netdev(dev); - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); - } + struct net_device *dev = pci_get_drvdata(pdev); + + if (dev) { + struct pcnet32_private *lp = dev->priv; + + unregister_netdev(dev); + pcnet32_free_ring(dev); + release_region(dev->base_addr, PCNET32_TOTAL_SIZE); + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + free_netdev(dev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + } } static struct pci_driver pcnet32_driver = { - .name = DRV_NAME, - .probe = pcnet32_probe_pci, - .remove = __devexit_p(pcnet32_remove_one), - .id_table = pcnet32_pci_tbl, + .name = DRV_NAME, + .probe = pcnet32_probe_pci, + .remove = __devexit_p(pcnet32_remove_one), + .id_table = pcnet32_pci_tbl, }; /* An additional parameter that may be passed in... 
*/ @@ -2477,9 +2698,11 @@ static int pcnet32_have_pci; module_param(debug, int, 0); MODULE_PARM_DESC(debug, DRV_NAME " debug level"); module_param(max_interrupt_work, int, 0); -MODULE_PARM_DESC(max_interrupt_work, DRV_NAME " maximum events handled per interrupt"); +MODULE_PARM_DESC(max_interrupt_work, + DRV_NAME " maximum events handled per interrupt"); module_param(rx_copybreak, int, 0); -MODULE_PARM_DESC(rx_copybreak, DRV_NAME " copy breakpoint for copy-only-tiny-frames"); +MODULE_PARM_DESC(rx_copybreak, + DRV_NAME " copy breakpoint for copy-only-tiny-frames"); module_param(tx_start_pt, int, 0); MODULE_PARM_DESC(tx_start_pt, DRV_NAME " transmit start point (0-3)"); module_param(pcnet32vlb, int, 0); @@ -2490,7 +2713,9 @@ module_param_array(full_duplex, int, NULL, 0); MODULE_PARM_DESC(full_duplex, DRV_NAME " full duplex setting(s) (1)"); /* Module Parameter for HomePNA cards added by Patrick Simmons, 2004 */ module_param_array(homepna, int, NULL, 0); -MODULE_PARM_DESC(homepna, DRV_NAME " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet"); +MODULE_PARM_DESC(homepna, + DRV_NAME + " mode for 79C978 cards (1 for HomePNA, 0 for Ethernet, default Ethernet"); MODULE_AUTHOR("Thomas Bogendoerfer"); MODULE_DESCRIPTION("Driver for PCnet32 and PCnetPCI based ethercards"); @@ -2500,44 +2725,44 @@ MODULE_LICENSE("GPL"); static int __init pcnet32_init_module(void) { - printk(KERN_INFO "%s", version); + printk(KERN_INFO "%s", version); - pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT); + pcnet32_debug = netif_msg_init(debug, PCNET32_MSG_DEFAULT); - if ((tx_start_pt >= 0) && (tx_start_pt <= 3)) - tx_start = tx_start_pt; + if ((tx_start_pt >= 0) && (tx_start_pt <= 3)) + tx_start = tx_start_pt; - /* find the PCI devices */ - if (!pci_module_init(&pcnet32_driver)) - pcnet32_have_pci = 1; + /* find the PCI devices */ + if (!pci_module_init(&pcnet32_driver)) + pcnet32_have_pci = 1; - /* should we find any remaining VLbus devices ? */ - if (pcnet32vlb) - pcnet32_probe_vlbus(); + /* should we find any remaining VLbus devices ? */ + if (pcnet32vlb) + pcnet32_probe_vlbus(); - if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE)) - printk(KERN_INFO PFX "%d cards_found.\n", cards_found); + if (cards_found && (pcnet32_debug & NETIF_MSG_PROBE)) + printk(KERN_INFO PFX "%d cards_found.\n", cards_found); - return (pcnet32_have_pci + cards_found) ? 0 : -ENODEV; + return (pcnet32_have_pci + cards_found) ? 
0 : -ENODEV; } static void __exit pcnet32_cleanup_module(void) { - struct net_device *next_dev; - - while (pcnet32_dev) { - struct pcnet32_private *lp = pcnet32_dev->priv; - next_dev = lp->next; - unregister_netdev(pcnet32_dev); - pcnet32_free_ring(pcnet32_dev); - release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE); - pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); - free_netdev(pcnet32_dev); - pcnet32_dev = next_dev; - } + struct net_device *next_dev; + + while (pcnet32_dev) { + struct pcnet32_private *lp = pcnet32_dev->priv; + next_dev = lp->next; + unregister_netdev(pcnet32_dev); + pcnet32_free_ring(pcnet32_dev); + release_region(pcnet32_dev->base_addr, PCNET32_TOTAL_SIZE); + pci_free_consistent(lp->pci_dev, sizeof(*lp), lp, lp->dma_addr); + free_netdev(pcnet32_dev); + pcnet32_dev = next_dev; + } - if (pcnet32_have_pci) - pci_unregister_driver(&pcnet32_driver); + if (pcnet32_have_pci) + pci_unregister_driver(&pcnet32_driver); } module_init(pcnet32_init_module); diff --git a/drivers/net/skfp/fplustm.c b/drivers/net/skfp/fplustm.c index a4b2b6975d6..0784f558ca9 100644 --- a/drivers/net/skfp/fplustm.c +++ b/drivers/net/skfp/fplustm.c @@ -549,12 +549,12 @@ void formac_tx_restart(struct s_smc *smc) static void enable_formac(struct s_smc *smc) { /* set formac IMSK : 0 enables irq */ - outpw(FM_A(FM_IMSK1U),~mac_imsk1u) ; - outpw(FM_A(FM_IMSK1L),~mac_imsk1l) ; - outpw(FM_A(FM_IMSK2U),~mac_imsk2u) ; - outpw(FM_A(FM_IMSK2L),~mac_imsk2l) ; - outpw(FM_A(FM_IMSK3U),~mac_imsk3u) ; - outpw(FM_A(FM_IMSK3L),~mac_imsk3l) ; + outpw(FM_A(FM_IMSK1U),(unsigned short)~mac_imsk1u); + outpw(FM_A(FM_IMSK1L),(unsigned short)~mac_imsk1l); + outpw(FM_A(FM_IMSK2U),(unsigned short)~mac_imsk2u); + outpw(FM_A(FM_IMSK2L),(unsigned short)~mac_imsk2l); + outpw(FM_A(FM_IMSK3U),(unsigned short)~mac_imsk3u); + outpw(FM_A(FM_IMSK3L),(unsigned short)~mac_imsk3l); } #if 0 /* Removed because the driver should use the ASICs TX complete IRQ. 
*/ diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 25e028b7ce4..4eda81d41b1 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -44,7 +44,7 @@ #include "skge.h" #define DRV_NAME "skge" -#define DRV_VERSION "1.3" +#define DRV_VERSION "1.4" #define PFX DRV_NAME " " #define DEFAULT_TX_RING_SIZE 128 @@ -104,7 +104,6 @@ static const int txqaddr[] = { Q_XA1, Q_XA2 }; static const int rxqaddr[] = { Q_R1, Q_R2 }; static const u32 rxirqmask[] = { IS_R1_F, IS_R2_F }; static const u32 txirqmask[] = { IS_XA1_F, IS_XA2_F }; -static const u32 portirqmask[] = { IS_PORT_1, IS_PORT_2 }; static int skge_get_regs_len(struct net_device *dev) { @@ -728,19 +727,18 @@ static struct ethtool_ops skge_ethtool_ops = { * Allocate ring elements and chain them together * One-to-one association of board descriptors with ring elements */ -static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u64 base) +static int skge_ring_alloc(struct skge_ring *ring, void *vaddr, u32 base) { struct skge_tx_desc *d; struct skge_element *e; int i; - ring->start = kmalloc(sizeof(*e)*ring->count, GFP_KERNEL); + ring->start = kcalloc(sizeof(*e), ring->count, GFP_KERNEL); if (!ring->start) return -ENOMEM; for (i = 0, e = ring->start, d = vaddr; i < ring->count; i++, e++, d++) { e->desc = d; - e->skb = NULL; if (i == ring->count - 1) { e->next = ring->start; d->next_offset = base; @@ -2169,27 +2167,31 @@ static int skge_up(struct net_device *dev) if (!skge->mem) return -ENOMEM; + BUG_ON(skge->dma & 7); + + if ((u64)skge->dma >> 32 != ((u64) skge->dma + skge->mem_size) >> 32) { + printk(KERN_ERR PFX "pci_alloc_consistent region crosses 4G boundary\n"); + err = -EINVAL; + goto free_pci_mem; + } + memset(skge->mem, 0, skge->mem_size); - if ((err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma))) + err = skge_ring_alloc(&skge->rx_ring, skge->mem, skge->dma); + if (err) goto free_pci_mem; err = skge_rx_fill(skge); if (err) goto free_rx_ring; - if ((err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size, - skge->dma + rx_size))) + err = skge_ring_alloc(&skge->tx_ring, skge->mem + rx_size, + skge->dma + rx_size); + if (err) goto free_rx_ring; skge->tx_avail = skge->tx_ring.count - 1; - /* Enable IRQ from port */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= portirqmask[port]; - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - /* Initialize MAC */ spin_lock_bh(&hw->phy_lock); if (hw->chip_id == CHIP_ID_GENESIS) @@ -2246,11 +2248,6 @@ static int skge_down(struct net_device *dev) else yukon_stop(skge); - spin_lock_irq(&hw->hw_lock); - hw->intr_mask &= ~portirqmask[skge->port]; - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - /* Stop transmitter */ skge_write8(hw, Q_ADDR(txqaddr[port], Q_CSR), CSR_STOP); skge_write32(hw, RB_ADDR(txqaddr[port], RB_CTRL), @@ -2307,18 +2304,15 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) int i; u32 control, len; u64 map; - unsigned long flags; skb = skb_padto(skb, ETH_ZLEN); if (!skb) return NETDEV_TX_OK; - local_irq_save(flags); if (!spin_trylock(&skge->tx_lock)) { - /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + /* Collision - tell upper layer to requeue */ + return NETDEV_TX_LOCKED; + } if (unlikely(skge->tx_avail < skb_shinfo(skb)->nr_frags +1)) { if (!netif_queue_stopped(dev)) { @@ -2327,7 +2321,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) printk(KERN_WARNING PFX "%s: ring full when queue awake!\n", 
dev->name); } - spin_unlock_irqrestore(&skge->tx_lock, flags); + spin_unlock(&skge->tx_lock); return NETDEV_TX_BUSY; } @@ -2402,8 +2396,10 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); } + mmiowb(); + spin_unlock(&skge->tx_lock); + dev->trans_start = jiffies; - spin_unlock_irqrestore(&skge->tx_lock, flags); return NETDEV_TX_OK; } @@ -2416,7 +2412,7 @@ static inline void skge_tx_free(struct skge_hw *hw, struct skge_element *e) pci_unmap_addr(e, mapaddr), pci_unmap_len(e, maplen), PCI_DMA_TODEVICE); - dev_kfree_skb_any(e->skb); + dev_kfree_skb(e->skb); e->skb = NULL; } else { pci_unmap_page(hw->pdev, @@ -2430,15 +2426,14 @@ static void skge_tx_clean(struct skge_port *skge) { struct skge_ring *ring = &skge->tx_ring; struct skge_element *e; - unsigned long flags; - spin_lock_irqsave(&skge->tx_lock, flags); + spin_lock_bh(&skge->tx_lock); for (e = ring->to_clean; e != ring->to_use; e = e->next) { ++skge->tx_avail; skge_tx_free(skge->hw, e); } ring->to_clean = e; - spin_unlock_irqrestore(&skge->tx_lock, flags); + spin_unlock_bh(&skge->tx_lock); } static void skge_tx_timeout(struct net_device *dev) @@ -2663,6 +2658,37 @@ resubmit: return NULL; } +static void skge_tx_done(struct skge_port *skge) +{ + struct skge_ring *ring = &skge->tx_ring; + struct skge_element *e; + + spin_lock(&skge->tx_lock); + for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) { + struct skge_tx_desc *td = e->desc; + u32 control; + + rmb(); + control = td->control; + if (control & BMU_OWN) + break; + + if (unlikely(netif_msg_tx_done(skge))) + printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n", + skge->netdev->name, e - ring->start, td->status); + + skge_tx_free(skge->hw, e); + e->skb = NULL; + ++skge->tx_avail; + } + ring->to_clean = e; + skge_write8(skge->hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F); + + if (skge->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(skge->netdev); + + spin_unlock(&skge->tx_lock); +} static int skge_poll(struct net_device *dev, int *budget) { @@ -2670,8 +2696,10 @@ static int skge_poll(struct net_device *dev, int *budget) struct skge_hw *hw = skge->hw; struct skge_ring *ring = &skge->rx_ring; struct skge_element *e; - unsigned int to_do = min(dev->quota, *budget); - unsigned int work_done = 0; + int to_do = min(dev->quota, *budget); + int work_done = 0; + + skge_tx_done(skge); for (e = ring->to_clean; prefetch(e->next), work_done < to_do; e = e->next) { struct skge_rx_desc *rd = e->desc; @@ -2683,8 +2711,8 @@ static int skge_poll(struct net_device *dev, int *budget) if (control & BMU_OWN) break; - skb = skge_rx_get(skge, e, control, rd->status, - le16_to_cpu(rd->csum2)); + skb = skge_rx_get(skge, e, control, rd->status, + le16_to_cpu(rd->csum2)); if (likely(skb)) { dev->last_rx = jiffies; netif_receive_skb(skb); @@ -2705,49 +2733,15 @@ static int skge_poll(struct net_device *dev, int *budget) if (work_done >= to_do) return 1; /* not done */ - spin_lock_irq(&hw->hw_lock); - __netif_rx_complete(dev); - hw->intr_mask |= portirqmask[skge->port]; + netif_rx_complete(dev); + mmiowb(); + + hw->intr_mask |= skge->port == 0 ? 
(IS_R1_F|IS_XA1_F) : (IS_R2_F|IS_XA2_F); skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); return 0; } -static inline void skge_tx_intr(struct net_device *dev) -{ - struct skge_port *skge = netdev_priv(dev); - struct skge_hw *hw = skge->hw; - struct skge_ring *ring = &skge->tx_ring; - struct skge_element *e; - - spin_lock(&skge->tx_lock); - for (e = ring->to_clean; prefetch(e->next), e != ring->to_use; e = e->next) { - struct skge_tx_desc *td = e->desc; - u32 control; - - rmb(); - control = td->control; - if (control & BMU_OWN) - break; - - if (unlikely(netif_msg_tx_done(skge))) - printk(KERN_DEBUG PFX "%s: tx done slot %td status 0x%x\n", - dev->name, e - ring->start, td->status); - - skge_tx_free(hw, e); - e->skb = NULL; - ++skge->tx_avail; - } - ring->to_clean = e; - skge_write8(hw, Q_ADDR(txqaddr[skge->port], Q_CSR), CSR_IRQ_CL_F); - - if (skge->tx_avail > MAX_SKB_FRAGS + 1) - netif_wake_queue(dev); - - spin_unlock(&skge->tx_lock); -} - /* Parity errors seem to happen when Genesis is connected to a switch * with no other ports present. Heartbeat error?? */ @@ -2770,17 +2764,6 @@ static void skge_mac_parity(struct skge_hw *hw, int port) ? GMF_CLI_TX_FC : GMF_CLI_TX_PE); } -static void skge_pci_clear(struct skge_hw *hw) -{ - u16 status; - - pci_read_config_word(hw->pdev, PCI_STATUS, &status); - skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); - pci_write_config_word(hw->pdev, PCI_STATUS, - status | PCI_STATUS_ERROR_BITS); - skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); -} - static void skge_mac_intr(struct skge_hw *hw, int port) { if (hw->chip_id == CHIP_ID_GENESIS) @@ -2822,23 +2805,39 @@ static void skge_error_irq(struct skge_hw *hw) if (hwstatus & IS_M2_PAR_ERR) skge_mac_parity(hw, 1); - if (hwstatus & IS_R1_PAR_ERR) + if (hwstatus & IS_R1_PAR_ERR) { + printk(KERN_ERR PFX "%s: receive queue parity error\n", + hw->dev[0]->name); skge_write32(hw, B0_R1_CSR, CSR_IRQ_CL_P); + } - if (hwstatus & IS_R2_PAR_ERR) + if (hwstatus & IS_R2_PAR_ERR) { + printk(KERN_ERR PFX "%s: receive queue parity error\n", + hw->dev[1]->name); skge_write32(hw, B0_R2_CSR, CSR_IRQ_CL_P); + } if (hwstatus & (IS_IRQ_MST_ERR|IS_IRQ_STAT)) { - printk(KERN_ERR PFX "hardware error detected (status 0x%x)\n", - hwstatus); + u16 pci_status, pci_cmd; + + pci_read_config_word(hw->pdev, PCI_COMMAND, &pci_cmd); + pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status); - skge_pci_clear(hw); + printk(KERN_ERR PFX "%s: PCI error cmd=%#x status=%#x\n", + pci_name(hw->pdev), pci_cmd, pci_status); + + /* Write the error bits back to clear them. 
*/ + pci_status &= PCI_STATUS_ERROR_BITS; + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); + pci_write_config_word(hw->pdev, PCI_COMMAND, + pci_cmd | PCI_COMMAND_SERR | PCI_COMMAND_PARITY); + pci_write_config_word(hw->pdev, PCI_STATUS, pci_status); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); /* if error still set then just ignore it */ hwstatus = skge_read32(hw, B0_HWE_ISRC); if (hwstatus & IS_IRQ_STAT) { - pr_debug("IRQ status %x: still set ignoring hardware errors\n", - hwstatus); + printk(KERN_INFO PFX "unable to clear error (so ignoring them)\n"); hw->intr_mask &= ~IS_HW_ERR; } } @@ -2855,12 +2854,11 @@ static void skge_extirq(unsigned long data) int port; spin_lock(&hw->phy_lock); - for (port = 0; port < 2; port++) { + for (port = 0; port < hw->ports; port++) { struct net_device *dev = hw->dev[port]; + struct skge_port *skge = netdev_priv(dev); - if (dev && netif_running(dev)) { - struct skge_port *skge = netdev_priv(dev); - + if (netif_running(dev)) { if (hw->chip_id != CHIP_ID_GENESIS) yukon_phy_intr(skge); else @@ -2869,38 +2867,39 @@ static void skge_extirq(unsigned long data) } spin_unlock(&hw->phy_lock); - spin_lock_irq(&hw->hw_lock); hw->intr_mask |= IS_EXT_REG; skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); } static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs) { struct skge_hw *hw = dev_id; - u32 status = skge_read32(hw, B0_SP_ISRC); + u32 status; - if (status == 0 || status == ~0) /* hotplug or shared irq */ + /* Reading this register masks IRQ */ + status = skge_read32(hw, B0_SP_ISRC); + if (status == 0) return IRQ_NONE; - spin_lock(&hw->hw_lock); - if (status & IS_R1_F) { + if (status & IS_EXT_REG) { + hw->intr_mask &= ~IS_EXT_REG; + tasklet_schedule(&hw->ext_tasklet); + } + + if (status & (IS_R1_F|IS_XA1_F)) { skge_write8(hw, Q_ADDR(Q_R1, Q_CSR), CSR_IRQ_CL_F); - hw->intr_mask &= ~IS_R1_F; + hw->intr_mask &= ~(IS_R1_F|IS_XA1_F); netif_rx_schedule(hw->dev[0]); } - if (status & IS_R2_F) { + if (status & (IS_R2_F|IS_XA2_F)) { skge_write8(hw, Q_ADDR(Q_R2, Q_CSR), CSR_IRQ_CL_F); - hw->intr_mask &= ~IS_R2_F; + hw->intr_mask &= ~(IS_R2_F|IS_XA2_F); netif_rx_schedule(hw->dev[1]); } - if (status & IS_XA1_F) - skge_tx_intr(hw->dev[0]); - - if (status & IS_XA2_F) - skge_tx_intr(hw->dev[1]); + if (likely((status & hw->intr_mask) == 0)) + return IRQ_HANDLED; if (status & IS_PA_TO_RX1) { struct skge_port *skge = netdev_priv(hw->dev[0]); @@ -2929,13 +2928,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id, struct pt_regs *regs) if (status & IS_HW_ERR) skge_error_irq(hw); - if (status & IS_EXT_REG) { - hw->intr_mask &= ~IS_EXT_REG; - tasklet_schedule(&hw->ext_tasklet); - } - skge_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock(&hw->hw_lock); return IRQ_HANDLED; } @@ -3010,7 +3003,7 @@ static const char *skge_board_name(const struct skge_hw *hw) static int skge_reset(struct skge_hw *hw) { u32 reg; - u16 ctst; + u16 ctst, pci_status; u8 t8, mac_cfg, pmd_type, phy_type; int i; @@ -3021,8 +3014,13 @@ static int skge_reset(struct skge_hw *hw) skge_write8(hw, B0_CTST, CS_RST_CLR); /* clear PCI errors, if any */ - skge_pci_clear(hw); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); + skge_write8(hw, B2_TST_CTRL2, 0); + pci_read_config_word(hw->pdev, PCI_STATUS, &pci_status); + pci_write_config_word(hw->pdev, PCI_STATUS, + pci_status | PCI_STATUS_ERROR_BITS); + skge_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); skge_write8(hw, B0_CTST, CS_MRST_CLR); /* restore CLK_RUN bits (for Yukon-Lite) */ @@ -3081,7 +3079,10 @@ static int 
skge_reset(struct skge_hw *hw) else hw->ram_size = t8 * 4096; - hw->intr_mask = IS_HW_ERR | IS_EXT_REG; + hw->intr_mask = IS_HW_ERR | IS_EXT_REG | IS_PORT_1; + if (hw->ports > 1) + hw->intr_mask |= IS_PORT_2; + if (hw->chip_id == CHIP_ID_GENESIS) genesis_init(hw); else { @@ -3251,13 +3252,15 @@ static int __devinit skge_probe(struct pci_dev *pdev, struct skge_hw *hw; int err, using_dac = 0; - if ((err = pci_enable_device(pdev))) { + err = pci_enable_device(pdev); + if (err) { printk(KERN_ERR PFX "%s cannot enable PCI device\n", pci_name(pdev)); goto err_out; } - if ((err = pci_request_regions(pdev, DRV_NAME))) { + err = pci_request_regions(pdev, DRV_NAME); + if (err) { printk(KERN_ERR PFX "%s cannot obtain PCI resources\n", pci_name(pdev)); goto err_out_disable_pdev; @@ -3265,22 +3268,18 @@ static int __devinit skge_probe(struct pci_dev *pdev, pci_set_master(pdev); - if (sizeof(dma_addr_t) > sizeof(u32) && - !(err = pci_set_dma_mask(pdev, DMA_64BIT_MASK))) { + if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { using_dac = 1; err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); - if (err < 0) { - printk(KERN_ERR PFX "%s unable to obtain 64 bit DMA " - "for consistent allocations\n", pci_name(pdev)); - goto err_out_free_regions; - } - } else { - err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); - if (err) { - printk(KERN_ERR PFX "%s no usable DMA configuration\n", - pci_name(pdev)); - goto err_out_free_regions; - } + } else if (!(err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { + using_dac = 0; + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + } + + if (err) { + printk(KERN_ERR PFX "%s no usable DMA configuration\n", + pci_name(pdev)); + goto err_out_free_regions; } #ifdef __BIG_ENDIAN @@ -3304,7 +3303,6 @@ static int __devinit skge_probe(struct pci_dev *pdev, hw->pdev = pdev; spin_lock_init(&hw->phy_lock); - spin_lock_init(&hw->hw_lock); tasklet_init(&hw->ext_tasklet, skge_extirq, (unsigned long) hw); hw->regs = ioremap_nocache(pci_resource_start(pdev, 0), 0x4000); @@ -3314,7 +3312,8 @@ static int __devinit skge_probe(struct pci_dev *pdev, goto err_out_free_hw; } - if ((err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw))) { + err = request_irq(pdev->irq, skge_intr, SA_SHIRQ, DRV_NAME, hw); + if (err) { printk(KERN_ERR PFX "%s: cannot assign irq %d\n", pci_name(pdev), pdev->irq); goto err_out_iounmap; @@ -3332,7 +3331,8 @@ static int __devinit skge_probe(struct pci_dev *pdev, if ((dev = skge_devinit(hw, 0, using_dac)) == NULL) goto err_out_led_off; - if ((err = register_netdev(dev))) { + err = register_netdev(dev); + if (err) { printk(KERN_ERR PFX "%s: cannot register net device\n", pci_name(pdev)); goto err_out_free_netdev; @@ -3387,7 +3387,6 @@ static void __devexit skge_remove(struct pci_dev *pdev) skge_write32(hw, B0_IMSK, 0); skge_write16(hw, B0_LED, LED_STAT_OFF); - skge_pci_clear(hw); skge_write8(hw, B0_CTST, CS_RST_SET); tasklet_kill(&hw->ext_tasklet); diff --git a/drivers/net/skge.h b/drivers/net/skge.h index 941f12a333b..2efdacc290e 100644 --- a/drivers/net/skge.h +++ b/drivers/net/skge.h @@ -2402,7 +2402,6 @@ struct skge_hw { struct tasklet_struct ext_tasklet; spinlock_t phy_lock; - spinlock_t hw_lock; }; enum { diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 73260364cba..f08fe6c884b 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -51,7 +51,7 @@ #include "sky2.h" #define DRV_NAME "sky2" -#define DRV_VERSION "0.15" +#define DRV_VERSION "1.1" #define PFX DRV_NAME " " /* @@ -61,10 +61,6 @@ * a receive requires one (or two if using 64 bit 
dma). */ -#define is_ec_a1(hw) \ - unlikely((hw)->chip_id == CHIP_ID_YUKON_EC && \ - (hw)->chip_rev == CHIP_REV_YU_EC_A1) - #define RX_LE_SIZE 512 #define RX_LE_BYTES (RX_LE_SIZE*sizeof(struct sky2_rx_le)) #define RX_MAX_PENDING (RX_LE_SIZE/2 - 2) @@ -96,6 +92,10 @@ static int copybreak __read_mostly = 256; module_param(copybreak, int, 0); MODULE_PARM_DESC(copybreak, "Receive copy threshold"); +static int disable_msi = 0; +module_param(disable_msi, int, 0); +MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)"); + static const struct pci_device_id sky2_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, @@ -504,9 +504,9 @@ static void sky2_phy_init(struct sky2_hw *hw, unsigned port) /* Force a renegotiation */ static void sky2_phy_reinit(struct sky2_port *sky2) { - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); sky2_phy_init(sky2->hw, sky2->port); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); } static void sky2_mac_init(struct sky2_hw *hw, unsigned port) @@ -571,9 +571,9 @@ static void sky2_mac_init(struct sky2_hw *hw, unsigned port) sky2_read16(hw, SK_REG(port, GMAC_IRQ_SRC)); - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); sky2_phy_init(hw, port); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); /* MIB clear */ reg = gma_read16(hw, port, GM_PHY_ADDR); @@ -725,37 +725,11 @@ static inline struct sky2_tx_le *get_tx_le(struct sky2_port *sky2) return le; } -/* - * This is a workaround code taken from SysKonnect sk98lin driver - * to deal with chip bug on Yukon EC rev 0 in the wraparound case. - */ -static void sky2_put_idx(struct sky2_hw *hw, unsigned q, - u16 idx, u16 *last, u16 size) +/* Update chip's next pointer */ +static inline void sky2_put_idx(struct sky2_hw *hw, unsigned q, u16 idx) { wmb(); - if (is_ec_a1(hw) && idx < *last) { - u16 hwget = sky2_read16(hw, Y2_QADDR(q, PREF_UNIT_GET_IDX)); - - if (hwget == 0) { - /* Start prefetching again */ - sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 0xe0); - goto setnew; - } - - if (hwget == size - 1) { - /* set watermark to one list element */ - sky2_write8(hw, Y2_QADDR(q, PREF_UNIT_FIFO_WM), 8); - - /* set put index to first list element */ - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), 0); - } else /* have hardware go to end of list */ - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), - size - 1); - } else { -setnew: - sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx); - } - *last = idx; + sky2_write16(hw, Y2_QADDR(q, PREF_UNIT_PUT_IDX), idx); mmiowb(); } @@ -878,7 +852,7 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!netif_running(dev)) return -ENODEV; /* Phy still in reset */ - switch(cmd) { + switch (cmd) { case SIOCGMIIPHY: data->phy_id = PHY_ADDR_MARV; @@ -886,9 +860,9 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: { u16 val = 0; - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); err = __gm_phy_read(hw, sky2->port, data->reg_num & 0x1f, &val); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); data->val_out = val; break; @@ -898,10 +872,10 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f, data->val_in); - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); break; } return err; @@ -1001,7 +975,6 @@ static int 
sky2_rx_start(struct sky2_port *sky2) /* Tell chip about available buffers */ sky2_write16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX), sky2->rx_put); - sky2->rx_last_put = sky2_read16(hw, Y2_QADDR(rxq, PREF_UNIT_PUT_IDX)); return 0; nomem: sky2_rx_clean(sky2); @@ -1014,7 +987,7 @@ static int sky2_up(struct net_device *dev) struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; unsigned port = sky2->port; - u32 ramsize, rxspace; + u32 ramsize, rxspace, imask; int err = -ENOMEM; if (netif_msg_ifup(sky2)) @@ -1079,10 +1052,10 @@ static int sky2_up(struct net_device *dev) goto err_out; /* Enable interrupts from phy/mac for port */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); + imask = sky2_read32(hw, B0_IMSK); + imask |= (port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; + sky2_write32(hw, B0_IMSK, imask); + return 0; err_out: @@ -1299,8 +1272,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) netif_stop_queue(dev); } - sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod, - &sky2->tx_last_put, TX_RING_SIZE); + sky2_put_idx(hw, txqaddr[sky2->port], sky2->tx_prod); out_unlock: spin_unlock(&sky2->tx_lock); @@ -1332,7 +1304,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) struct tx_ring_info *re = sky2->tx_ring + put; struct sk_buff *skb = re->skb; - nxt = re->idx; + nxt = re->idx; BUG_ON(nxt >= TX_RING_SIZE); prefetch(sky2->tx_ring + nxt); @@ -1348,7 +1320,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) struct tx_ring_info *fre; fre = sky2->tx_ring + (put + i + 1) % TX_RING_SIZE; pci_unmap_page(pdev, pci_unmap_addr(fre, mapaddr), - skb_shinfo(skb)->frags[i].size, + skb_shinfo(skb)->frags[i].size, PCI_DMA_TODEVICE); } @@ -1356,7 +1328,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done) } sky2->tx_cons = put; - if (netif_queue_stopped(dev) && tx_avail(sky2) > MAX_SKB_TX_LE) + if (tx_avail(sky2) > MAX_SKB_TX_LE) netif_wake_queue(dev); } @@ -1375,6 +1347,7 @@ static int sky2_down(struct net_device *dev) struct sky2_hw *hw = sky2->hw; unsigned port = sky2->port; u16 ctrl; + u32 imask; /* Never really got started! */ if (!sky2->tx_le) @@ -1386,14 +1359,6 @@ static int sky2_down(struct net_device *dev) /* Stop more packets from being queued */ netif_stop_queue(dev); - /* Disable port IRQ */ - spin_lock_irq(&hw->hw_lock); - hw->intr_mask &= ~((sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - - flush_scheduled_work(); - sky2_phy_reset(hw, port); /* Stop transmitter */ @@ -1437,6 +1402,11 @@ static int sky2_down(struct net_device *dev) sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET); sky2_write8(hw, SK_REG(port, TX_GMF_CTRL_T), GMF_RST_SET); + /* Disable port IRQ */ + imask = sky2_read32(hw, B0_IMSK); + imask &= ~(sky2->port == 0) ? Y2_IS_PORT_1 : Y2_IS_PORT_2; + sky2_write32(hw, B0_IMSK, imask); + /* turn off LED's */ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF); @@ -1631,20 +1601,19 @@ static int sky2_autoneg_done(struct sky2_port *sky2, u16 aux) return 0; } -/* - * Interrupt from PHY are handled outside of interrupt context - * because accessing phy registers requires spin wait which might - * cause excess interrupt latency. 
- */ -static void sky2_phy_task(void *arg) +/* Interrupt from PHY */ +static void sky2_phy_intr(struct sky2_hw *hw, unsigned port) { - struct sky2_port *sky2 = arg; - struct sky2_hw *hw = sky2->hw; + struct net_device *dev = hw->dev[port]; + struct sky2_port *sky2 = netdev_priv(dev); u16 istatus, phystat; - down(&sky2->phy_sema); - istatus = gm_phy_read(hw, sky2->port, PHY_MARV_INT_STAT); - phystat = gm_phy_read(hw, sky2->port, PHY_MARV_PHY_STAT); + spin_lock(&sky2->phy_lock); + istatus = gm_phy_read(hw, port, PHY_MARV_INT_STAT); + phystat = gm_phy_read(hw, port, PHY_MARV_PHY_STAT); + + if (!netif_running(dev)) + goto out; if (netif_msg_intr(sky2)) printk(KERN_INFO PFX "%s: phy interrupt status 0x%x 0x%x\n", @@ -1670,12 +1639,7 @@ static void sky2_phy_task(void *arg) sky2_link_down(sky2); } out: - up(&sky2->phy_sema); - - spin_lock_irq(&hw->hw_lock); - hw->intr_mask |= (sky2->port == 0) ? Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); + spin_unlock(&sky2->phy_lock); } @@ -1687,31 +1651,40 @@ static void sky2_tx_timeout(struct net_device *dev) struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; unsigned txq = txqaddr[sky2->port]; - u16 ridx; - - /* Maybe we just missed an status interrupt */ - spin_lock(&sky2->tx_lock); - ridx = sky2_read16(hw, - sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX); - sky2_tx_complete(sky2, ridx); - spin_unlock(&sky2->tx_lock); - - if (!netif_queue_stopped(dev)) { - if (net_ratelimit()) - pr_info(PFX "transmit interrupt missed? recovered\n"); - return; - } + u16 report, done; if (netif_msg_timer(sky2)) printk(KERN_ERR PFX "%s: tx timeout\n", dev->name); - sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP); - sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET); + report = sky2_read16(hw, sky2->port == 0 ? STAT_TXA1_RIDX : STAT_TXA2_RIDX); + done = sky2_read16(hw, Q_ADDR(txq, Q_DONE)); - sky2_tx_clean(sky2); + printk(KERN_DEBUG PFX "%s: transmit ring %u .. %u report=%u done=%u\n", + dev->name, + sky2->tx_cons, sky2->tx_prod, report, done); - sky2_qset(hw, txq); - sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1); + if (report != done) { + printk(KERN_INFO PFX "status burst pending (irq moderation?)\n"); + + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); + } else if (report != sky2->tx_cons) { + printk(KERN_INFO PFX "status report lost?\n"); + + spin_lock_bh(&sky2->tx_lock); + sky2_tx_complete(sky2, report); + spin_unlock_bh(&sky2->tx_lock); + } else { + printk(KERN_INFO PFX "hardware hung? 
flushing\n"); + + sky2_write32(hw, Q_ADDR(txq, Q_CSR), BMU_STOP); + sky2_write32(hw, Y2_QADDR(txq, PREF_UNIT_CTRL), PREF_UNIT_RST_SET); + + sky2_tx_clean(sky2); + + sky2_qset(hw, txq); + sky2_prefetch_init(hw, txq, sky2->tx_le_map, TX_RING_SIZE - 1); + } } @@ -1730,6 +1703,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) struct sky2_hw *hw = sky2->hw; int err; u16 ctl, mode; + u32 imask; if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) return -EINVAL; @@ -1742,12 +1716,15 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) return 0; } + imask = sky2_read32(hw, B0_IMSK); sky2_write32(hw, B0_IMSK, 0); dev->trans_start = jiffies; /* prevent tx timeout */ netif_stop_queue(dev); netif_poll_disable(hw->dev[0]); + synchronize_irq(hw->pdev->irq); + ctl = gma_read16(hw, sky2->port, GM_GP_CTRL); gma_write16(hw, sky2->port, GM_GP_CTRL, ctl & ~GM_GPCR_RX_ENA); sky2_rx_stop(sky2); @@ -1766,7 +1743,7 @@ static int sky2_change_mtu(struct net_device *dev, int new_mtu) sky2_write8(hw, RB_ADDR(rxqaddr[sky2->port], RB_CTRL), RB_ENA_OP_MD); err = sky2_rx_start(sky2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); + sky2_write32(hw, B0_IMSK, imask); if (err) dev_close(dev); @@ -1843,8 +1820,7 @@ resubmit: sky2_rx_add(sky2, re->mapaddr); /* Tell receiver about new buffers. */ - sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put, - &sky2->rx_last_put, RX_LE_SIZE); + sky2_put_idx(sky2->hw, rxqaddr[sky2->port], sky2->rx_put); return skb; @@ -1871,76 +1847,51 @@ error: goto resubmit; } -/* - * Check for transmit complete - */ -#define TX_NO_STATUS 0xffff - -static void sky2_tx_check(struct sky2_hw *hw, int port, u16 last) +/* Transmit complete */ +static inline void sky2_tx_done(struct net_device *dev, u16 last) { - if (last != TX_NO_STATUS) { - struct net_device *dev = hw->dev[port]; - if (dev && netif_running(dev)) { - struct sky2_port *sky2 = netdev_priv(dev); + struct sky2_port *sky2 = netdev_priv(dev); - spin_lock(&sky2->tx_lock); - sky2_tx_complete(sky2, last); - spin_unlock(&sky2->tx_lock); - } + if (netif_running(dev)) { + spin_lock(&sky2->tx_lock); + sky2_tx_complete(sky2, last); + spin_unlock(&sky2->tx_lock); } } -/* - * Both ports share the same status interrupt, therefore there is only - * one poll routine. - */ -static int sky2_poll(struct net_device *dev0, int *budget) +/* Process status response ring */ +static int sky2_status_intr(struct sky2_hw *hw, int to_do) { - struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; - unsigned int to_do = min(dev0->quota, *budget); - unsigned int work_done = 0; - u16 hwidx; - u16 tx_done[2] = { TX_NO_STATUS, TX_NO_STATUS }; - - sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ); - - /* - * Kick the STAT_LEV_TIMER_CTRL timer. - * This fixes my hangs on Yukon-EC (0xb6) rev 1. - * The if clause is there to start the timer only if it has been - * configured correctly and not been disabled via ethtool. 
- */ - if (sky2_read8(hw, STAT_LEV_TIMER_CTRL) == TIM_START) { - sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_STOP); - sky2_write8(hw, STAT_LEV_TIMER_CTRL, TIM_START); - } + int work_done = 0; - hwidx = sky2_read16(hw, STAT_PUT_IDX); - BUG_ON(hwidx >= STATUS_RING_SIZE); rmb(); - while (hwidx != hw->st_idx) { + for(;;) { struct sky2_status_le *le = hw->st_le + hw->st_idx; struct net_device *dev; struct sky2_port *sky2; struct sk_buff *skb; u32 status; u16 length; + u8 link, opcode; + + opcode = le->opcode; + if (!opcode) + break; + opcode &= ~HW_OWNER; - le = hw->st_le + hw->st_idx; hw->st_idx = (hw->st_idx + 1) % STATUS_RING_SIZE; - prefetch(hw->st_le + hw->st_idx); + le->opcode = 0; - BUG_ON(le->link >= 2); - dev = hw->dev[le->link]; - if (dev == NULL || !netif_running(dev)) - continue; + link = le->link; + BUG_ON(link >= 2); + dev = hw->dev[link]; sky2 = netdev_priv(dev); - status = le32_to_cpu(le->status); - length = le16_to_cpu(le->length); + length = le->length; + status = le->status; - switch (le->opcode & ~HW_OWNER) { + switch (opcode) { case OP_RXSTAT: skb = sky2_receive(sky2, length, status); if (!skb) @@ -1980,42 +1931,23 @@ static int sky2_poll(struct net_device *dev0, int *budget) case OP_TXINDEXLE: /* TX index reports status for both ports */ - tx_done[0] = status & 0xffff; - tx_done[1] = ((status >> 24) & 0xff) - | (u16)(length & 0xf) << 8; + sky2_tx_done(hw->dev[0], status & 0xffff); + if (hw->dev[1]) + sky2_tx_done(hw->dev[1], + ((status >> 24) & 0xff) + | (u16)(length & 0xf) << 8); break; default: if (net_ratelimit()) printk(KERN_WARNING PFX - "unknown status opcode 0x%x\n", le->opcode); + "unknown status opcode 0x%x\n", opcode); break; } } exit_loop: - sky2_tx_check(hw, 0, tx_done[0]); - sky2_tx_check(hw, 1, tx_done[1]); - - if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); - } - - if (likely(work_done < to_do)) { - spin_lock_irq(&hw->hw_lock); - __netif_rx_complete(dev0); - - hw->intr_mask |= Y2_IS_STAT_BMU; - sky2_write32(hw, B0_IMSK, hw->intr_mask); - spin_unlock_irq(&hw->hw_lock); - - return 0; - } else { - *budget -= work_done; - dev0->quota -= work_done; - return 1; - } + return work_done; } static void sky2_hw_error(struct sky2_hw *hw, unsigned port, u32 status) @@ -2134,57 +2066,97 @@ static void sky2_mac_intr(struct sky2_hw *hw, unsigned port) } } -static void sky2_phy_intr(struct sky2_hw *hw, unsigned port) +/* This should never happen it is a fatal situation */ +static void sky2_descriptor_error(struct sky2_hw *hw, unsigned port, + const char *rxtx, u32 mask) { struct net_device *dev = hw->dev[port]; struct sky2_port *sky2 = netdev_priv(dev); + u32 imask; + + printk(KERN_ERR PFX "%s: %s descriptor error (hardware problem)\n", + dev ? dev->name : "<not registered>", rxtx); - hw->intr_mask &= ~(port == 0 ? 
Y2_IS_IRQ_PHY1 : Y2_IS_IRQ_PHY2); - sky2_write32(hw, B0_IMSK, hw->intr_mask); + imask = sky2_read32(hw, B0_IMSK); + imask &= ~mask; + sky2_write32(hw, B0_IMSK, imask); - schedule_work(&sky2->phy_task); + if (dev) { + spin_lock(&sky2->phy_lock); + sky2_link_down(sky2); + spin_unlock(&sky2->phy_lock); + } } -static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs) +static int sky2_poll(struct net_device *dev0, int *budget) { - struct sky2_hw *hw = dev_id; - struct net_device *dev0 = hw->dev[0]; - u32 status; + struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; + int work_limit = min(dev0->quota, *budget); + int work_done = 0; + u32 status = sky2_read32(hw, B0_Y2_SP_EISR); - status = sky2_read32(hw, B0_Y2_SP_ISRC2); - if (status == 0 || status == ~0) - return IRQ_NONE; + if (unlikely(status & ~Y2_IS_STAT_BMU)) { + if (status & Y2_IS_HW_ERR) + sky2_hw_intr(hw); - spin_lock(&hw->hw_lock); - if (status & Y2_IS_HW_ERR) - sky2_hw_intr(hw); + if (status & Y2_IS_IRQ_PHY1) + sky2_phy_intr(hw, 0); - /* Do NAPI for Rx and Tx status */ - if (status & Y2_IS_STAT_BMU) { - hw->intr_mask &= ~Y2_IS_STAT_BMU; - sky2_write32(hw, B0_IMSK, hw->intr_mask); + if (status & Y2_IS_IRQ_PHY2) + sky2_phy_intr(hw, 1); - if (likely(__netif_rx_schedule_prep(dev0))) { - prefetch(&hw->st_le[hw->st_idx]); - __netif_rx_schedule(dev0); - } + if (status & Y2_IS_IRQ_MAC1) + sky2_mac_intr(hw, 0); + + if (status & Y2_IS_IRQ_MAC2) + sky2_mac_intr(hw, 1); + + if (status & Y2_IS_CHK_RX1) + sky2_descriptor_error(hw, 0, "receive", Y2_IS_CHK_RX1); + + if (status & Y2_IS_CHK_RX2) + sky2_descriptor_error(hw, 1, "receive", Y2_IS_CHK_RX2); + + if (status & Y2_IS_CHK_TXA1) + sky2_descriptor_error(hw, 0, "transmit", Y2_IS_CHK_TXA1); + + if (status & Y2_IS_CHK_TXA2) + sky2_descriptor_error(hw, 1, "transmit", Y2_IS_CHK_TXA2); } - if (status & Y2_IS_IRQ_PHY1) - sky2_phy_intr(hw, 0); + if (status & Y2_IS_STAT_BMU) { + work_done = sky2_status_intr(hw, work_limit); + *budget -= work_done; + dev0->quota -= work_done; + + if (work_done >= work_limit) + return 1; - if (status & Y2_IS_IRQ_PHY2) - sky2_phy_intr(hw, 1); + sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ); + } - if (status & Y2_IS_IRQ_MAC1) - sky2_mac_intr(hw, 0); + netif_rx_complete(dev0); - if (status & Y2_IS_IRQ_MAC2) - sky2_mac_intr(hw, 1); + status = sky2_read32(hw, B0_Y2_SP_LISR); + return 0; +} - sky2_write32(hw, B0_Y2_SP_ICR, 2); +static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs) +{ + struct sky2_hw *hw = dev_id; + struct net_device *dev0 = hw->dev[0]; + u32 status; - spin_unlock(&hw->hw_lock); + /* Reading this mask interrupts as side effect */ + status = sky2_read32(hw, B0_Y2_SP_ISRC2); + if (status == 0 || status == ~0) + return IRQ_NONE; + + prefetch(&hw->st_le[hw->st_idx]); + if (likely(__netif_rx_schedule_prep(dev0))) + __netif_rx_schedule(dev0); + else + printk(KERN_DEBUG PFX "irq race detected\n"); return IRQ_HANDLED; } @@ -2238,6 +2210,23 @@ static int sky2_reset(struct sky2_hw *hw) return -EOPNOTSUPP; } + hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4; + + /* This rev is really old, and requires untested workarounds */ + if (hw->chip_id == CHIP_ID_YUKON_EC && hw->chip_rev == CHIP_REV_YU_EC_A1) { + printk(KERN_ERR PFX "%s: unsupported revision Yukon-%s (0x%x) rev %d\n", + pci_name(hw->pdev), yukon2_name[hw->chip_id - CHIP_ID_YUKON_XL], + hw->chip_id, hw->chip_rev); + return -EOPNOTSUPP; + } + + /* This chip is new and not tested yet */ + if (hw->chip_id == CHIP_ID_YUKON_EC_U) { + pr_info(PFX "%s: is a 
version of Yukon 2 chipset that has not been tested yet.\n", + pci_name(hw->pdev)); + pr_info("Please report success/failure to maintainer <shemminger@osdl.org>\n"); + } + /* disable ASF */ if (hw->chip_id <= CHIP_ID_YUKON_EC) { sky2_write8(hw, B28_Y2_ASF_STAT_CMD, Y2_ASF_RESET); @@ -2258,7 +2247,7 @@ static int sky2_reset(struct sky2_hw *hw) sky2_write8(hw, B0_CTST, CS_MRST_CLR); /* clear any PEX errors */ - if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) + if (pci_find_capability(hw->pdev, PCI_CAP_ID_EXP)) sky2_pci_write32(hw, PEX_UNC_ERR_STAT, 0xffffffffUL); @@ -2271,7 +2260,6 @@ static int sky2_reset(struct sky2_hw *hw) if (!(sky2_read8(hw, B2_Y2_CLK_GATE) & Y2_STATUS_LNK2_INAC)) ++hw->ports; } - hw->chip_rev = (sky2_read8(hw, B2_MAC_CFG) & CFG_CHIP_R_MSK) >> 4; sky2_set_power_state(hw, PCI_D0); @@ -2337,30 +2325,18 @@ static int sky2_reset(struct sky2_hw *hw) /* Set the list last index */ sky2_write16(hw, STAT_LAST_IDX, STATUS_RING_SIZE - 1); - /* These status setup values are copied from SysKonnect's driver */ - if (is_ec_a1(hw)) { - /* WA for dev. #4.3 */ - sky2_write16(hw, STAT_TX_IDX_TH, 0xfff); /* Tx Threshold */ - - /* set Status-FIFO watermark */ - sky2_write8(hw, STAT_FIFO_WM, 0x21); /* WA for dev. #4.18 */ + sky2_write16(hw, STAT_TX_IDX_TH, 10); + sky2_write8(hw, STAT_FIFO_WM, 16); - /* set Status-FIFO ISR watermark */ - sky2_write8(hw, STAT_FIFO_ISR_WM, 0x07); /* WA for dev. #4.18 */ - sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 10000)); - } else { - sky2_write16(hw, STAT_TX_IDX_TH, 10); - sky2_write8(hw, STAT_FIFO_WM, 16); - - /* set Status-FIFO ISR watermark */ - if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0) - sky2_write8(hw, STAT_FIFO_ISR_WM, 4); - else - sky2_write8(hw, STAT_FIFO_ISR_WM, 16); + /* set Status-FIFO ISR watermark */ + if (hw->chip_id == CHIP_ID_YUKON_XL && hw->chip_rev == 0) + sky2_write8(hw, STAT_FIFO_ISR_WM, 4); + else + sky2_write8(hw, STAT_FIFO_ISR_WM, 16); - sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000)); - sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 7)); - } + sky2_write32(hw, STAT_TX_TIMER_INI, sky2_us2clk(hw, 1000)); + sky2_write32(hw, STAT_ISR_TIMER_INI, sky2_us2clk(hw, 20)); + sky2_write32(hw, STAT_LEV_TIMER_INI, sky2_us2clk(hw, 100)); /* enable status unit */ sky2_write32(hw, STAT_CTRL, SC_STAT_OP_ON); @@ -2743,7 +2719,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data) ms = data * 1000; /* save initial values */ - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); if (hw->chip_id == CHIP_ID_YUKON_XL) { u16 pg = gm_phy_read(hw, port, PHY_MARV_EXT_ADR); gm_phy_write(hw, port, PHY_MARV_EXT_ADR, 3); @@ -2759,9 +2735,9 @@ static int sky2_phys_id(struct net_device *dev, u32 data) sky2_led(hw, port, onoff); onoff = !onoff; - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); interrupted = msleep_interruptible(250); - down(&sky2->phy_sema); + spin_lock_bh(&sky2->phy_lock); ms -= 250; } @@ -2776,7 +2752,7 @@ static int sky2_phys_id(struct net_device *dev, u32 data) gm_phy_write(hw, port, PHY_MARV_LED_CTRL, ledctrl); gm_phy_write(hw, port, PHY_MARV_LED_OVER, ledover); } - up(&sky2->phy_sema); + spin_unlock_bh(&sky2->phy_lock); return 0; } @@ -2806,38 +2782,6 @@ static int sky2_set_pauseparam(struct net_device *dev, return err; } -#ifdef CONFIG_PM -static void sky2_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -{ - struct sky2_port *sky2 = netdev_priv(dev); - - wol->supported = WAKE_MAGIC; - wol->wolopts = sky2->wol ? 
WAKE_MAGIC : 0; -} - -static int sky2_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) -{ - struct sky2_port *sky2 = netdev_priv(dev); - struct sky2_hw *hw = sky2->hw; - - if (wol->wolopts != WAKE_MAGIC && wol->wolopts != 0) - return -EOPNOTSUPP; - - sky2->wol = wol->wolopts == WAKE_MAGIC; - - if (sky2->wol) { - memcpy_toio(hw->regs + WOL_MAC_ADDR, dev->dev_addr, ETH_ALEN); - - sky2_write16(hw, WOL_CTRL_STAT, - WOL_CTL_ENA_PME_ON_MAGIC_PKT | - WOL_CTL_ENA_MAGIC_PKT_UNIT); - } else - sky2_write16(hw, WOL_CTRL_STAT, WOL_CTL_DEFAULT); - - return 0; -} -#endif - static int sky2_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ecmd) { @@ -2878,19 +2822,11 @@ static int sky2_set_coalesce(struct net_device *dev, { struct sky2_port *sky2 = netdev_priv(dev); struct sky2_hw *hw = sky2->hw; - const u32 tmin = sky2_clk2us(hw, 1); - const u32 tmax = 5000; - - if (ecmd->tx_coalesce_usecs != 0 && - (ecmd->tx_coalesce_usecs < tmin || ecmd->tx_coalesce_usecs > tmax)) - return -EINVAL; - - if (ecmd->rx_coalesce_usecs != 0 && - (ecmd->rx_coalesce_usecs < tmin || ecmd->rx_coalesce_usecs > tmax)) - return -EINVAL; + const u32 tmax = sky2_clk2us(hw, 0x0ffffff); - if (ecmd->rx_coalesce_usecs_irq != 0 && - (ecmd->rx_coalesce_usecs_irq < tmin || ecmd->rx_coalesce_usecs_irq > tmax)) + if (ecmd->tx_coalesce_usecs > tmax || + ecmd->rx_coalesce_usecs > tmax || + ecmd->rx_coalesce_usecs_irq > tmax) return -EINVAL; if (ecmd->tx_max_coalesced_frames >= TX_RING_SIZE-1) @@ -3025,10 +2961,6 @@ static struct ethtool_ops sky2_ethtool_ops = { .set_ringparam = sky2_set_ringparam, .get_pauseparam = sky2_get_pauseparam, .set_pauseparam = sky2_set_pauseparam, -#ifdef CONFIG_PM - .get_wol = sky2_get_wol, - .set_wol = sky2_set_wol, -#endif .phys_id = sky2_phys_id, .get_stats_count = sky2_get_stats_count, .get_ethtool_stats = sky2_get_ethtool_stats, @@ -3082,16 +3014,15 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw, sky2->speed = -1; sky2->advertising = sky2_supported_modes(hw); - /* Receive checksum disabled for Yukon XL + /* Receive checksum disabled for Yukon XL * because of observed problems with incorrect * values when multiple packets are received in one interrupt */ sky2->rx_csum = (hw->chip_id != CHIP_ID_YUKON_XL); - INIT_WORK(&sky2->phy_task, sky2_phy_task, sky2); - init_MUTEX(&sky2->phy_sema); + spin_lock_init(&sky2->phy_lock); sky2->tx_pending = TX_DEF_PENDING; - sky2->rx_pending = is_ec_a1(hw) ? 
8 : RX_DEF_PENDING; + sky2->rx_pending = RX_DEF_PENDING; sky2->rx_bufsize = sky2_buf_size(ETH_DATA_LEN); hw->dev[port] = dev; @@ -3133,6 +3064,66 @@ static void __devinit sky2_show_addr(struct net_device *dev) dev->dev_addr[3], dev->dev_addr[4], dev->dev_addr[5]); } +/* Handle software interrupt used during MSI test */ +static irqreturn_t __devinit sky2_test_intr(int irq, void *dev_id, + struct pt_regs *regs) +{ + struct sky2_hw *hw = dev_id; + u32 status = sky2_read32(hw, B0_Y2_SP_ISRC2); + + if (status == 0) + return IRQ_NONE; + + if (status & Y2_IS_IRQ_SW) { + hw->msi_detected = 1; + wake_up(&hw->msi_wait); + sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ); + } + sky2_write32(hw, B0_Y2_SP_ICR, 2); + + return IRQ_HANDLED; +} + +/* Test interrupt path by forcing a a software IRQ */ +static int __devinit sky2_test_msi(struct sky2_hw *hw) +{ + struct pci_dev *pdev = hw->pdev; + int err; + + sky2_write32(hw, B0_IMSK, Y2_IS_IRQ_SW); + + err = request_irq(pdev->irq, sky2_test_intr, SA_SHIRQ, DRV_NAME, hw); + if (err) { + printk(KERN_ERR PFX "%s: cannot assign irq %d\n", + pci_name(pdev), pdev->irq); + return err; + } + + init_waitqueue_head (&hw->msi_wait); + + sky2_write8(hw, B0_CTST, CS_ST_SW_IRQ); + wmb(); + + wait_event_timeout(hw->msi_wait, hw->msi_detected, HZ/10); + + if (!hw->msi_detected) { + /* MSI test failed, go back to INTx mode */ + printk(KERN_WARNING PFX "%s: No interrupt was generated using MSI, " + "switching to INTx mode. Please report this failure to " + "the PCI maintainer and include system chipset information.\n", + pci_name(pdev)); + + err = -EOPNOTSUPP; + sky2_write8(hw, B0_CTST, CS_CL_SW_IRQ); + } + + sky2_write32(hw, B0_IMSK, 0); + + free_irq(pdev->irq, hw); + + return err; +} + static int __devinit sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -3201,7 +3192,6 @@ static int __devinit sky2_probe(struct pci_dev *pdev, goto err_out_free_hw; } hw->pm_cap = pm_cap; - spin_lock_init(&hw->hw_lock); #ifdef __BIG_ENDIAN /* byte swap descriptors in hardware */ @@ -3254,21 +3244,29 @@ static int __devinit sky2_probe(struct pci_dev *pdev, } } - err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw); + if (!disable_msi && pci_enable_msi(pdev) == 0) { + err = sky2_test_msi(hw); + if (err == -EOPNOTSUPP) + pci_disable_msi(pdev); + else if (err) + goto err_out_unregister; + } + + err = request_irq(pdev->irq, sky2_intr, SA_SHIRQ, DRV_NAME, hw); if (err) { printk(KERN_ERR PFX "%s: cannot assign irq %d\n", pci_name(pdev), pdev->irq); goto err_out_unregister; } - hw->intr_mask = Y2_IS_BASE; - sky2_write32(hw, B0_IMSK, hw->intr_mask); + sky2_write32(hw, B0_IMSK, Y2_IS_BASE); pci_set_drvdata(pdev, hw); return 0; err_out_unregister: + pci_disable_msi(pdev); if (dev1) { unregister_netdev(dev1); free_netdev(dev1); @@ -3311,6 +3309,7 @@ static void __devexit sky2_remove(struct pci_dev *pdev) sky2_read8(hw, B0_CTST); free_irq(pdev->irq, hw); + pci_disable_msi(pdev); pci_free_consistent(pdev, STATUS_LE_BYTES, hw->st_le, hw->st_dma); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h index dce955c76f3..d63cd5a1b71 100644 --- a/drivers/net/sky2.h +++ b/drivers/net/sky2.h @@ -278,13 +278,11 @@ enum { Y2_IS_CHK_TXS1 = 1<<1, /* Descriptor error TXS 1 */ Y2_IS_CHK_TXA1 = 1<<0, /* Descriptor error TXA 1 */ - Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU | - Y2_IS_POLL_CHK | Y2_IS_TWSI_RDY | - Y2_IS_IRQ_SW | Y2_IS_TIMINT, - Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1 | - Y2_IS_CHK_RX1 | Y2_IS_CHK_TXA1 | Y2_IS_CHK_TXS1, - 
Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2 | - Y2_IS_CHK_RX2 | Y2_IS_CHK_TXA2 | Y2_IS_CHK_TXS2, + Y2_IS_BASE = Y2_IS_HW_ERR | Y2_IS_STAT_BMU, + Y2_IS_PORT_1 = Y2_IS_IRQ_PHY1 | Y2_IS_IRQ_MAC1 + | Y2_IS_CHK_TXA1 | Y2_IS_CHK_RX1, + Y2_IS_PORT_2 = Y2_IS_IRQ_PHY2 | Y2_IS_IRQ_MAC2 + | Y2_IS_CHK_TXA2 | Y2_IS_CHK_RX2, }; /* B2_IRQM_HWE_MSK 32 bit IRQ Moderation HW Error Mask */ @@ -1832,6 +1830,7 @@ struct sky2_port { struct net_device *netdev; unsigned port; u32 msg_enable; + spinlock_t phy_lock; spinlock_t tx_lock ____cacheline_aligned_in_smp; struct tx_ring_info *tx_ring; @@ -1840,7 +1839,6 @@ struct sky2_port { u16 tx_prod; /* next le to use */ u32 tx_addr64; u16 tx_pending; - u16 tx_last_put; u16 tx_last_mss; struct ring_info *rx_ring ____cacheline_aligned_in_smp; @@ -1849,7 +1847,6 @@ struct sky2_port { u16 rx_next; /* next re to check */ u16 rx_put; /* next le index to use */ u16 rx_pending; - u16 rx_last_put; u16 rx_bufsize; #ifdef SKY2_VLAN_TAG_USED u16 rx_tag; @@ -1865,20 +1862,15 @@ struct sky2_port { u8 rx_pause; u8 tx_pause; u8 rx_csum; - u8 wol; struct net_device_stats net_stats; - struct work_struct phy_task; - struct semaphore phy_sema; }; struct sky2_hw { void __iomem *regs; struct pci_dev *pdev; struct net_device *dev[2]; - spinlock_t hw_lock; - u32 intr_mask; int pm_cap; u8 chip_id; @@ -1889,6 +1881,8 @@ struct sky2_hw { struct sky2_status_le *st_le; u32 st_idx; dma_addr_t st_dma; + int msi_detected; + wait_queue_head_t msi_wait; }; /* Register accessor for memory mapped device */ diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c index 75e9b3b910c..0e9833adf9f 100644 --- a/drivers/net/smc91x.c +++ b/drivers/net/smc91x.c @@ -215,15 +215,12 @@ struct smc_local { spinlock_t lock; -#ifdef SMC_CAN_USE_DATACS - u32 __iomem *datacs; -#endif - #ifdef SMC_USE_PXA_DMA /* DMA needs the physical address of the chip */ u_long physaddr; #endif void __iomem *base; + void __iomem *datacs; }; #if SMC_DEBUG > 0 @@ -2104,9 +2101,8 @@ static int smc_enable_device(struct platform_device *pdev) * Set the appropriate byte/word mode. 
*/ ecsr = readb(addr + (ECSR << SMC_IO_SHIFT)) & ~ECSR_IOIS8; -#ifndef SMC_CAN_USE_16BIT - ecsr |= ECSR_IOIS8; -#endif + if (!SMC_CAN_USE_16BIT) + ecsr |= ECSR_IOIS8; writeb(ecsr, addr + (ECSR << SMC_IO_SHIFT)); local_irq_restore(flags); @@ -2143,40 +2139,39 @@ static void smc_release_attrib(struct platform_device *pdev) release_mem_region(res->start, ATTRIB_SIZE); } -#ifdef SMC_CAN_USE_DATACS -static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) +static inline void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) { - struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); - struct smc_local *lp = netdev_priv(ndev); + if (SMC_CAN_USE_DATACS) { + struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); + struct smc_local *lp = netdev_priv(ndev); - if (!res) - return; + if (!res) + return; - if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) { - printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME); - return; - } + if(!request_mem_region(res->start, SMC_DATA_EXTENT, CARDNAME)) { + printk(KERN_INFO "%s: failed to request datacs memory region.\n", CARDNAME); + return; + } - lp->datacs = ioremap(res->start, SMC_DATA_EXTENT); + lp->datacs = ioremap(res->start, SMC_DATA_EXTENT); + } } static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) { - struct smc_local *lp = netdev_priv(ndev); - struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); + if (SMC_CAN_USE_DATACS) { + struct smc_local *lp = netdev_priv(ndev); + struct resource * res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "smc91x-data32"); - if (lp->datacs) - iounmap(lp->datacs); + if (lp->datacs) + iounmap(lp->datacs); - lp->datacs = NULL; + lp->datacs = NULL; - if (res) - release_mem_region(res->start, SMC_DATA_EXTENT); + if (res) + release_mem_region(res->start, SMC_DATA_EXTENT); + } } -#else -static void smc_request_datacs(struct platform_device *pdev, struct net_device *ndev) {} -static void smc_release_datacs(struct platform_device *pdev, struct net_device *ndev) {} -#endif /* * smc_init(void) diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h index e0efd1964e7..e1be1af5120 100644 --- a/drivers/net/smc91x.h +++ b/drivers/net/smc91x.h @@ -275,7 +275,10 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg) #define SMC_insw(a,r,p,l) readsw ((void*) ((a) + (r)), p, l) #define SMC_outw(v,a,r) ({ writew ((v), (a) + (r)); LPD7A40X_IOBARRIER; }) -static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) +#define SMC_outsw LPD7A40X_SMC_outsw + +static inline void LPD7A40X_SMC_outsw(unsigned long a, int r, + unsigned char* p, int l) { unsigned short* ps = (unsigned short*) p; while (l-- > 0) { @@ -342,10 +345,6 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l) #endif -#ifndef SMC_IRQ_FLAGS -#define SMC_IRQ_FLAGS SA_TRIGGER_RISING -#endif - #ifdef SMC_USE_PXA_DMA /* * Let's use the DMA engine on the XScale PXA2xx for RX packets. This is @@ -441,10 +440,85 @@ smc_pxa_dma_irq(int dma, void *dummy, struct pt_regs *regs) #endif /* SMC_USE_PXA_DMA */ -/* Because of bank switching, the LAN91x uses only 16 I/O ports */ +/* + * Everything a particular hardware setup needs should have been defined + * at this point. Add stubs for the undefined cases, mainly to avoid + * compilation warnings since they'll be optimized away, or to prevent buggy + * use of them. 
+ */ + +#if ! SMC_CAN_USE_32BIT +#define SMC_inl(ioaddr, reg) ({ BUG(); 0; }) +#define SMC_outl(x, ioaddr, reg) BUG() +#define SMC_insl(a, r, p, l) BUG() +#define SMC_outsl(a, r, p, l) BUG() +#endif + +#if !defined(SMC_insl) || !defined(SMC_outsl) +#define SMC_insl(a, r, p, l) BUG() +#define SMC_outsl(a, r, p, l) BUG() +#endif + +#if ! SMC_CAN_USE_16BIT + +/* + * Any 16-bit access is performed with two 8-bit accesses if the hardware + * can't do it directly. Most registers are 16-bit so those are mandatory. + */ +#define SMC_outw(x, ioaddr, reg) \ + do { \ + unsigned int __val16 = (x); \ + SMC_outb( __val16, ioaddr, reg ); \ + SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\ + } while (0) +#define SMC_inw(ioaddr, reg) \ + ({ \ + unsigned int __val16; \ + __val16 = SMC_inb( ioaddr, reg ); \ + __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \ + __val16; \ + }) + +#define SMC_insw(a, r, p, l) BUG() +#define SMC_outsw(a, r, p, l) BUG() + +#endif + +#if !defined(SMC_insw) || !defined(SMC_outsw) +#define SMC_insw(a, r, p, l) BUG() +#define SMC_outsw(a, r, p, l) BUG() +#endif + +#if ! SMC_CAN_USE_8BIT +#define SMC_inb(ioaddr, reg) ({ BUG(); 0; }) +#define SMC_outb(x, ioaddr, reg) BUG() +#define SMC_insb(a, r, p, l) BUG() +#define SMC_outsb(a, r, p, l) BUG() +#endif + +#if !defined(SMC_insb) || !defined(SMC_outsb) +#define SMC_insb(a, r, p, l) BUG() +#define SMC_outsb(a, r, p, l) BUG() +#endif + +#ifndef SMC_CAN_USE_DATACS +#define SMC_CAN_USE_DATACS 0 +#endif + #ifndef SMC_IO_SHIFT #define SMC_IO_SHIFT 0 #endif + +#ifndef SMC_IRQ_FLAGS +#define SMC_IRQ_FLAGS SA_TRIGGER_RISING +#endif + +#ifndef SMC_INTERRUPT_PREAMBLE +#define SMC_INTERRUPT_PREAMBLE +#endif + + +/* Because of bank switching, the LAN91x uses only 16 I/O ports */ #define SMC_IO_EXTENT (16 << SMC_IO_SHIFT) #define SMC_DATA_EXTENT (4) @@ -817,6 +891,11 @@ static const char * chip_ids[ 16 ] = { * Note: the following macros do *not* select the bank -- this must * be done separately as needed in the main code. The SMC_REG() macro * only uses the bank argument for debugging purposes (when enabled). + * + * Note: despite inline functions being safer, everything leading to this + * should preferably be macros to let BUG() display the line number in + * the core source code since we're interested in the top call site + * not in any inline function location. */ #if SMC_DEBUG > 0 @@ -834,62 +913,142 @@ static const char * chip_ids[ 16 ] = { #define SMC_REG(reg, bank) (reg<<SMC_IO_SHIFT) #endif -#if SMC_CAN_USE_8BIT -#define SMC_GET_PN() SMC_inb( ioaddr, PN_REG ) -#define SMC_SET_PN(x) SMC_outb( x, ioaddr, PN_REG ) -#define SMC_GET_AR() SMC_inb( ioaddr, AR_REG ) -#define SMC_GET_TXFIFO() SMC_inb( ioaddr, TXFIFO_REG ) -#define SMC_GET_RXFIFO() SMC_inb( ioaddr, RXFIFO_REG ) -#define SMC_GET_INT() SMC_inb( ioaddr, INT_REG ) -#define SMC_ACK_INT(x) SMC_outb( x, ioaddr, INT_REG ) -#define SMC_GET_INT_MASK() SMC_inb( ioaddr, IM_REG ) -#define SMC_SET_INT_MASK(x) SMC_outb( x, ioaddr, IM_REG ) -#else -#define SMC_GET_PN() (SMC_inw( ioaddr, PN_REG ) & 0xFF) -#define SMC_SET_PN(x) SMC_outw( x, ioaddr, PN_REG ) -#define SMC_GET_AR() (SMC_inw( ioaddr, PN_REG ) >> 8) -#define SMC_GET_TXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) & 0xFF) -#define SMC_GET_RXFIFO() (SMC_inw( ioaddr, TXFIFO_REG ) >> 8) -#define SMC_GET_INT() (SMC_inw( ioaddr, INT_REG ) & 0xFF) +/* + * Hack Alert: Some setups just can't write 8 or 16 bits reliably when not + * aligned to a 32 bit boundary. I tell you that does exist! 
+ * Fortunately the affected register accesses can be easily worked around + * since we can write zeroes to the preceeding 16 bits without adverse + * effects and use a 32-bit access. + * + * Enforce it on any 32-bit capable setup for now. + */ +#define SMC_MUST_ALIGN_WRITE SMC_CAN_USE_32BIT + +#define SMC_GET_PN() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, PN_REG)) \ + : (SMC_inw(ioaddr, PN_REG) & 0xFF) ) + +#define SMC_SET_PN(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(0, 2)); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, PN_REG); \ + else \ + SMC_outw(x, ioaddr, PN_REG); \ + } while (0) + +#define SMC_GET_AR() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, AR_REG)) \ + : (SMC_inw(ioaddr, PN_REG) >> 8) ) + +#define SMC_GET_TXFIFO() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, TXFIFO_REG)) \ + : (SMC_inw(ioaddr, TXFIFO_REG) & 0xFF) ) + +#define SMC_GET_RXFIFO() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, RXFIFO_REG)) \ + : (SMC_inw(ioaddr, TXFIFO_REG) >> 8) ) + +#define SMC_GET_INT() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, INT_REG)) \ + : (SMC_inw(ioaddr, INT_REG) & 0xFF) ) + #define SMC_ACK_INT(x) \ do { \ - unsigned long __flags; \ - int __mask; \ - local_irq_save(__flags); \ - __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \ - SMC_outw( __mask | (x), ioaddr, INT_REG ); \ - local_irq_restore(__flags); \ + if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, INT_REG); \ + else { \ + unsigned long __flags; \ + int __mask; \ + local_irq_save(__flags); \ + __mask = SMC_inw( ioaddr, INT_REG ) & ~0xff; \ + SMC_outw( __mask | (x), ioaddr, INT_REG ); \ + local_irq_restore(__flags); \ + } \ + } while (0) + +#define SMC_GET_INT_MASK() \ + ( SMC_CAN_USE_8BIT ? (SMC_inb(ioaddr, IM_REG)) \ + : (SMC_inw( ioaddr, INT_REG ) >> 8) ) + +#define SMC_SET_INT_MASK(x) \ + do { \ + if (SMC_CAN_USE_8BIT) \ + SMC_outb(x, ioaddr, IM_REG); \ + else \ + SMC_outw((x) << 8, ioaddr, INT_REG); \ + } while (0) + +#define SMC_CURRENT_BANK() SMC_inw(ioaddr, BANK_SELECT) + +#define SMC_SELECT_BANK(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, 12<<SMC_IO_SHIFT); \ + else \ + SMC_outw(x, ioaddr, BANK_SELECT); \ + } while (0) + +#define SMC_GET_BASE() SMC_inw(ioaddr, BASE_REG) + +#define SMC_SET_BASE(x) SMC_outw(x, ioaddr, BASE_REG) + +#define SMC_GET_CONFIG() SMC_inw(ioaddr, CONFIG_REG) + +#define SMC_SET_CONFIG(x) SMC_outw(x, ioaddr, CONFIG_REG) + +#define SMC_GET_COUNTER() SMC_inw(ioaddr, COUNTER_REG) + +#define SMC_GET_CTL() SMC_inw(ioaddr, CTL_REG) + +#define SMC_SET_CTL(x) SMC_outw(x, ioaddr, CTL_REG) + +#define SMC_GET_MII() SMC_inw(ioaddr, MII_REG) + +#define SMC_SET_MII(x) SMC_outw(x, ioaddr, MII_REG) + +#define SMC_GET_MIR() SMC_inw(ioaddr, MIR_REG) + +#define SMC_SET_MIR(x) SMC_outw(x, ioaddr, MIR_REG) + +#define SMC_GET_MMU_CMD() SMC_inw(ioaddr, MMU_CMD_REG) + +#define SMC_SET_MMU_CMD(x) SMC_outw(x, ioaddr, MMU_CMD_REG) + +#define SMC_GET_FIFO() SMC_inw(ioaddr, FIFO_REG) + +#define SMC_GET_PTR() SMC_inw(ioaddr, PTR_REG) + +#define SMC_SET_PTR(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(4, 2)); \ + else \ + SMC_outw(x, ioaddr, PTR_REG); \ } while (0) -#define SMC_GET_INT_MASK() (SMC_inw( ioaddr, INT_REG ) >> 8) -#define SMC_SET_INT_MASK(x) SMC_outw( (x) << 8, ioaddr, INT_REG ) -#endif -#define SMC_CURRENT_BANK() SMC_inw( ioaddr, BANK_SELECT ) -#define SMC_SELECT_BANK(x) SMC_outw( x, ioaddr, BANK_SELECT ) -#define SMC_GET_BASE() SMC_inw( ioaddr, BASE_REG ) -#define SMC_SET_BASE(x) SMC_outw( x, ioaddr, BASE_REG ) -#define 
SMC_GET_CONFIG() SMC_inw( ioaddr, CONFIG_REG ) -#define SMC_SET_CONFIG(x) SMC_outw( x, ioaddr, CONFIG_REG ) -#define SMC_GET_COUNTER() SMC_inw( ioaddr, COUNTER_REG ) -#define SMC_GET_CTL() SMC_inw( ioaddr, CTL_REG ) -#define SMC_SET_CTL(x) SMC_outw( x, ioaddr, CTL_REG ) -#define SMC_GET_MII() SMC_inw( ioaddr, MII_REG ) -#define SMC_SET_MII(x) SMC_outw( x, ioaddr, MII_REG ) -#define SMC_GET_MIR() SMC_inw( ioaddr, MIR_REG ) -#define SMC_SET_MIR(x) SMC_outw( x, ioaddr, MIR_REG ) -#define SMC_GET_MMU_CMD() SMC_inw( ioaddr, MMU_CMD_REG ) -#define SMC_SET_MMU_CMD(x) SMC_outw( x, ioaddr, MMU_CMD_REG ) -#define SMC_GET_FIFO() SMC_inw( ioaddr, FIFO_REG ) -#define SMC_GET_PTR() SMC_inw( ioaddr, PTR_REG ) -#define SMC_SET_PTR(x) SMC_outw( x, ioaddr, PTR_REG ) -#define SMC_GET_EPH_STATUS() SMC_inw( ioaddr, EPH_STATUS_REG ) -#define SMC_GET_RCR() SMC_inw( ioaddr, RCR_REG ) -#define SMC_SET_RCR(x) SMC_outw( x, ioaddr, RCR_REG ) -#define SMC_GET_REV() SMC_inw( ioaddr, REV_REG ) -#define SMC_GET_RPC() SMC_inw( ioaddr, RPC_REG ) -#define SMC_SET_RPC(x) SMC_outw( x, ioaddr, RPC_REG ) -#define SMC_GET_TCR() SMC_inw( ioaddr, TCR_REG ) -#define SMC_SET_TCR(x) SMC_outw( x, ioaddr, TCR_REG ) +#define SMC_GET_EPH_STATUS() SMC_inw(ioaddr, EPH_STATUS_REG) + +#define SMC_GET_RCR() SMC_inw(ioaddr, RCR_REG) + +#define SMC_SET_RCR(x) SMC_outw(x, ioaddr, RCR_REG) + +#define SMC_GET_REV() SMC_inw(ioaddr, REV_REG) + +#define SMC_GET_RPC() SMC_inw(ioaddr, RPC_REG) + +#define SMC_SET_RPC(x) \ + do { \ + if (SMC_MUST_ALIGN_WRITE) \ + SMC_outl((x)<<16, ioaddr, SMC_REG(8, 0)); \ + else \ + SMC_outw(x, ioaddr, RPC_REG); \ + } while (0) + +#define SMC_GET_TCR() SMC_inw(ioaddr, TCR_REG) + +#define SMC_SET_TCR(x) SMC_outw(x, ioaddr, TCR_REG) #ifndef SMC_GET_MAC_ADDR #define SMC_GET_MAC_ADDR(addr) \ @@ -920,151 +1079,84 @@ static const char * chip_ids[ 16 ] = { SMC_outw( mt[6] | (mt[7] << 8), ioaddr, MCAST_REG4 ); \ } while (0) -#if SMC_CAN_USE_32BIT -/* - * Some setups just can't write 8 or 16 bits reliably when not aligned - * to a 32 bit boundary. I tell you that exists! - * We re-do the ones here that can be easily worked around if they can have - * their low parts written to 0 without adverse effects. 
- */ -#undef SMC_SELECT_BANK -#define SMC_SELECT_BANK(x) SMC_outl( (x)<<16, ioaddr, 12<<SMC_IO_SHIFT ) -#undef SMC_SET_RPC -#define SMC_SET_RPC(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(8, 0) ) -#undef SMC_SET_PN -#define SMC_SET_PN(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(0, 2) ) -#undef SMC_SET_PTR -#define SMC_SET_PTR(x) SMC_outl( (x)<<16, ioaddr, SMC_REG(4, 2) ) -#endif - -#if SMC_CAN_USE_32BIT -#define SMC_PUT_PKT_HDR(status, length) \ - SMC_outl( (status) | (length) << 16, ioaddr, DATA_REG ) -#define SMC_GET_PKT_HDR(status, length) \ - do { \ - unsigned int __val = SMC_inl( ioaddr, DATA_REG ); \ - (status) = __val & 0xffff; \ - (length) = __val >> 16; \ - } while (0) -#else #define SMC_PUT_PKT_HDR(status, length) \ do { \ - SMC_outw( status, ioaddr, DATA_REG ); \ - SMC_outw( length, ioaddr, DATA_REG ); \ - } while (0) -#define SMC_GET_PKT_HDR(status, length) \ - do { \ - (status) = SMC_inw( ioaddr, DATA_REG ); \ - (length) = SMC_inw( ioaddr, DATA_REG ); \ + if (SMC_CAN_USE_32BIT) \ + SMC_outl((status) | (length)<<16, ioaddr, DATA_REG); \ + else { \ + SMC_outw(status, ioaddr, DATA_REG); \ + SMC_outw(length, ioaddr, DATA_REG); \ + } \ } while (0) -#endif -#if SMC_CAN_USE_32BIT -#define _SMC_PUSH_DATA(p, l) \ +#define SMC_GET_PKT_HDR(status, length) \ do { \ - char *__ptr = (p); \ - int __len = (l); \ - if (__len >= 2 && (unsigned long)__ptr & 2) { \ - __len -= 2; \ - SMC_outw( *(u16 *)__ptr, ioaddr, DATA_REG ); \ - __ptr += 2; \ - } \ - SMC_outsl( ioaddr, DATA_REG, __ptr, __len >> 2); \ - if (__len & 2) { \ - __ptr += (__len & ~3); \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ + if (SMC_CAN_USE_32BIT) { \ + unsigned int __val = SMC_inl(ioaddr, DATA_REG); \ + (status) = __val & 0xffff; \ + (length) = __val >> 16; \ + } else { \ + (status) = SMC_inw(ioaddr, DATA_REG); \ + (length) = SMC_inw(ioaddr, DATA_REG); \ } \ } while (0) -#define _SMC_PULL_DATA(p, l) \ - do { \ - char *__ptr = (p); \ - int __len = (l); \ - if ((unsigned long)__ptr & 2) { \ - /* \ - * We want 32bit alignment here. \ - * Since some buses perform a full 32bit \ - * fetch even for 16bit data we can't use \ - * SMC_inw() here. Back both source (on chip \ - * and destination) pointers of 2 bytes. \ - */ \ - __ptr -= 2; \ - __len += 2; \ - SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \ - } \ - __len += 2; \ - SMC_insl( ioaddr, DATA_REG, __ptr, __len >> 2); \ - } while (0) -#elif SMC_CAN_USE_16BIT -#define _SMC_PUSH_DATA(p, l) SMC_outsw( ioaddr, DATA_REG, p, (l) >> 1 ) -#define _SMC_PULL_DATA(p, l) SMC_insw ( ioaddr, DATA_REG, p, (l) >> 1 ) -#elif SMC_CAN_USE_8BIT -#define _SMC_PUSH_DATA(p, l) SMC_outsb( ioaddr, DATA_REG, p, l ) -#define _SMC_PULL_DATA(p, l) SMC_insb ( ioaddr, DATA_REG, p, l ) -#endif -#if ! 
SMC_CAN_USE_16BIT -#define SMC_outw(x, ioaddr, reg) \ +#define SMC_PUSH_DATA(p, l) \ do { \ - unsigned int __val16 = (x); \ - SMC_outb( __val16, ioaddr, reg ); \ - SMC_outb( __val16 >> 8, ioaddr, reg + (1 << SMC_IO_SHIFT));\ + if (SMC_CAN_USE_32BIT) { \ + void *__ptr = (p); \ + int __len = (l); \ + void *__ioaddr = ioaddr; \ + if (__len >= 2 && (unsigned long)__ptr & 2) { \ + __len -= 2; \ + SMC_outw(*(u16 *)__ptr, ioaddr, DATA_REG); \ + __ptr += 2; \ + } \ + if (SMC_CAN_USE_DATACS && lp->datacs) \ + __ioaddr = lp->datacs; \ + SMC_outsl(__ioaddr, DATA_REG, __ptr, __len>>2); \ + if (__len & 2) { \ + __ptr += (__len & ~3); \ + SMC_outw(*((u16 *)__ptr), ioaddr, DATA_REG); \ + } \ + } else if (SMC_CAN_USE_16BIT) \ + SMC_outsw(ioaddr, DATA_REG, p, (l) >> 1); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_outsb(ioaddr, DATA_REG, p, l); \ } while (0) -#define SMC_inw(ioaddr, reg) \ - ({ \ - unsigned int __val16; \ - __val16 = SMC_inb( ioaddr, reg ); \ - __val16 |= SMC_inb( ioaddr, reg + (1 << SMC_IO_SHIFT)) << 8; \ - __val16; \ - }) -#endif - -#ifdef SMC_CAN_USE_DATACS -#define SMC_PUSH_DATA(p, l) \ - if ( lp->datacs ) { \ - unsigned char *__ptr = (p); \ - int __len = (l); \ - if (__len >= 2 && (unsigned long)__ptr & 2) { \ - __len -= 2; \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ - __ptr += 2; \ - } \ - outsl(lp->datacs, __ptr, __len >> 2); \ - if (__len & 2) { \ - __ptr += (__len & ~3); \ - SMC_outw( *((u16 *)__ptr), ioaddr, DATA_REG ); \ - } \ - } else { \ - _SMC_PUSH_DATA(p, l); \ - } #define SMC_PULL_DATA(p, l) \ - if ( lp->datacs ) { \ - unsigned char *__ptr = (p); \ - int __len = (l); \ - if ((unsigned long)__ptr & 2) { \ - /* \ - * We want 32bit alignment here. \ - * Since some buses perform a full 32bit \ - * fetch even for 16bit data we can't use \ - * SMC_inw() here. Back both source (on chip \ - * and destination) pointers of 2 bytes. \ - */ \ - __ptr -= 2; \ + do { \ + if (SMC_CAN_USE_32BIT) { \ + void *__ptr = (p); \ + int __len = (l); \ + void *__ioaddr = ioaddr; \ + if ((unsigned long)__ptr & 2) { \ + /* \ + * We want 32bit alignment here. \ + * Since some buses perform a full \ + * 32bit fetch even for 16bit data \ + * we can't use SMC_inw() here. \ + * Back both source (on-chip) and \ + * destination pointers of 2 bytes. \ + * This is possible since the call to \ + * SMC_GET_PKT_HDR() already advanced \ + * the source pointer of 4 bytes, and \ + * the skb_reserve(skb, 2) advanced \ + * the destination pointer of 2 bytes. 
\ + */ \ + __ptr -= 2; \ + __len += 2; \ + SMC_SET_PTR(2|PTR_READ|PTR_RCV|PTR_AUTOINC); \ + } \ + if (SMC_CAN_USE_DATACS && lp->datacs) \ + __ioaddr = lp->datacs; \ __len += 2; \ - SMC_SET_PTR( 2|PTR_READ|PTR_RCV|PTR_AUTOINC ); \ - } \ - __len += 2; \ - insl( lp->datacs, __ptr, __len >> 2); \ - } else { \ - _SMC_PULL_DATA(p, l); \ - } -#else -#define SMC_PUSH_DATA(p, l) _SMC_PUSH_DATA(p, l) -#define SMC_PULL_DATA(p, l) _SMC_PULL_DATA(p, l) -#endif - -#if !defined (SMC_INTERRUPT_PREAMBLE) -# define SMC_INTERRUPT_PREAMBLE -#endif + SMC_insl(__ioaddr, DATA_REG, __ptr, __len>>2); \ + } else if (SMC_CAN_USE_16BIT) \ + SMC_insw(ioaddr, DATA_REG, p, (l) >> 1); \ + else if (SMC_CAN_USE_8BIT) \ + SMC_insb(ioaddr, DATA_REG, p, l); \ + } while (0) #endif /* _SMC91X_H_ */ diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index ff79e68b347..7b82ff090d4 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -3639,7 +3639,7 @@ iscsi_tcp_init(void) taskcache = kmem_cache_create("iscsi_taskcache", sizeof(struct iscsi_data_task), 0, - SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!taskcache) return -ENOMEM; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index a8b05ce5de5..7405d0df95d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1139,32 +1139,6 @@ sg_fasync(int fd, struct file *filp, int mode) return (retval < 0) ? retval : 0; } -/* When startFinish==1 increments page counts for pages other than the - first of scatter gather elements obtained from alloc_pages(). - When startFinish==0 decrements ... */ -static void -sg_rb_correct4mmap(Sg_scatter_hold * rsv_schp, int startFinish) -{ - struct scatterlist *sg = rsv_schp->buffer; - struct page *page; - int k, m; - - SCSI_LOG_TIMEOUT(3, printk("sg_rb_correct4mmap: startFinish=%d, scatg=%d\n", - startFinish, rsv_schp->k_use_sg)); - /* N.B. 
correction _not_ applied to base page of each allocation */ - for (k = 0; k < rsv_schp->k_use_sg; ++k, ++sg) { - for (m = PAGE_SIZE; m < sg->length; m += PAGE_SIZE) { - page = sg->page; - if (startFinish) - get_page(page); - else { - if (page_count(page) > 0) - __put_page(page); - } - } - } -} - static struct page * sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type) { @@ -1236,10 +1210,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) sa += len; } - if (0 == sfp->mmap_called) { - sg_rb_correct4mmap(rsv_schp, 1); /* do only once per fd lifetime */ - sfp->mmap_called = 1; - } + sfp->mmap_called = 1; vma->vm_flags |= VM_RESERVED; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; @@ -2388,8 +2359,6 @@ __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp) SCSI_LOG_TIMEOUT(6, printk("__sg_remove_sfp: bufflen=%d, k_use_sg=%d\n", (int) sfp->reserve.bufflen, (int) sfp->reserve.k_use_sg)); - if (sfp->mmap_called) - sg_rb_correct4mmap(&sfp->reserve, 0); /* undo correction */ sg_remove_scat(&sfp->reserve); } sfp->parentdp = NULL; @@ -2471,9 +2440,9 @@ sg_page_malloc(int rqSz, int lowDma, int *retSzp) return resp; if (lowDma) - page_mask = GFP_ATOMIC | GFP_DMA | __GFP_NOWARN; + page_mask = GFP_ATOMIC | GFP_DMA | __GFP_COMP | __GFP_NOWARN; else - page_mask = GFP_ATOMIC | __GFP_NOWARN; + page_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; for (order = 0, a_size = PAGE_SIZE; a_size < rqSz; order++, a_size <<= 1) ; diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index 89e5413cc2a..c66ef96c71b 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -866,7 +866,7 @@ config SERIAL_M32R_PLDSIO config SERIAL_TXX9 bool "TMPTX39XX/49XX SIO support" - depends HAS_TXX9_SERIAL && BROKEN + depends HAS_TXX9_SERIAL select SERIAL_CORE default y diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c index ee98a867bc6..141173efd46 100644 --- a/drivers/serial/serial_txx9.c +++ b/drivers/serial/serial_txx9.c @@ -33,6 +33,10 @@ * 1.02 Cleanup. (import 8250.c changes) * 1.03 Fix low-latency mode. (import 8250.c changes) * 1.04 Remove usage of deprecated functions, cleanup. + * 1.05 More strict check in verify_port. Cleanup. + * 1.06 Do not insert a char caused previous overrun. + * Fix some spin_locks. + * Do not call uart_add_one_port for absent ports. 
*/ #include <linux/config.h> @@ -57,7 +61,7 @@ #include <asm/io.h> #include <asm/irq.h> -static char *serial_version = "1.04"; +static char *serial_version = "1.06"; static char *serial_name = "TX39/49 Serial driver"; #define PASS_LIMIT 256 @@ -94,6 +98,8 @@ static char *serial_name = "TX39/49 Serial driver"; #define UART_NR 4 #endif +#define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8) + struct uart_txx9_port { struct uart_port port; @@ -210,7 +216,7 @@ static inline unsigned int sio_in(struct uart_txx9_port *up, int offset) { switch (up->port.iotype) { default: - return *(volatile u32 *)(up->port.membase + offset); + return __raw_readl(up->port.membase + offset); case UPIO_PORT: return inl(up->port.iobase + offset); } @@ -221,7 +227,7 @@ sio_out(struct uart_txx9_port *up, int offset, int value) { switch (up->port.iotype) { default: - *(volatile u32 *)(up->port.membase + offset) = value; + __raw_writel(value, up->port.membase + offset); break; case UPIO_PORT: outl(value, up->port.iobase + offset); @@ -259,34 +265,19 @@ sio_quot_set(struct uart_txx9_port *up, int quot) static void serial_txx9_stop_tx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_TIE); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_start_tx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); sio_set(up, TXX9_SIDICR, TXX9_SIDICR_TIE); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_stop_rx(struct uart_port *port) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - - spin_lock_irqsave(&up->port.lock, flags); up->port.read_status_mask &= ~TXX9_SIDISR_RDIS; -#if 0 - sio_mask(up, TXX9_SIDICR, TXX9_SIDICR_RIE); -#endif - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_enable_ms(struct uart_port *port) @@ -302,12 +293,16 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r unsigned int disr = *status; int max_count = 256; char flag; + unsigned int next_ignore_status_mask; do { ch = sio_in(up, TXX9_SIRFIFO); flag = TTY_NORMAL; up->port.icount.rx++; + /* mask out RFDN_MASK bit added by previous overrun */ + next_ignore_status_mask = + up->port.ignore_status_mask & ~TXX9_SIDISR_RFDN_MASK; if (unlikely(disr & (TXX9_SIDISR_UBRK | TXX9_SIDISR_UPER | TXX9_SIDISR_UFER | TXX9_SIDISR_UOER))) { /* @@ -328,8 +323,17 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r up->port.icount.parity++; else if (disr & TXX9_SIDISR_UFER) up->port.icount.frame++; - if (disr & TXX9_SIDISR_UOER) + if (disr & TXX9_SIDISR_UOER) { up->port.icount.overrun++; + /* + * The receiver read buffer still hold + * a char which caused overrun. + * Ignore next char by adding RFDN_MASK + * to ignore_status_mask temporarily. + */ + next_ignore_status_mask |= + TXX9_SIDISR_RFDN_MASK; + } /* * Mask off conditions which should be ingored. 
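
The receive_chars() hunk above works around a TX39/49 quirk: after an overrun the receive FIFO still holds the byte that triggered it, so the driver computes the ignore mask for the *next* character while handling the current one and arms TXX9_SIDISR_RFDN_MASK for exactly one iteration. Below is a minimal user-space sketch of that one-shot ignore-mask pattern; the DATA_READY/OVERRUN bits and the rx_one() helper are invented for illustration and only stand in for the real interrupt path.

    /* Hedged sketch, not the driver code: the overrun bit arms the ignore
     * mask for exactly one following character, then the mask is cleared
     * again.  All names here are invented for the illustration. */
    #include <stdio.h>

    #define DATA_READY 0x01
    #define OVERRUN    0x02

    static unsigned int ignore_mask;

    static void rx_one(unsigned int status, int ch)
    {
            /* start from the mask without the one-shot bit added last time */
            unsigned int next_ignore = ignore_mask & ~DATA_READY;

            if (status & OVERRUN)
                    /* FIFO still holds the byte that caused the overrun:
                     * drop the next DATA_READY event exactly once */
                    next_ignore |= DATA_READY;

            if (!(status & ignore_mask))
                    printf("delivered %c\n", ch);

            ignore_mask = next_ignore;
    }

    int main(void)
    {
            rx_one(DATA_READY | OVERRUN, 'a');   /* delivered, arms the skip */
            rx_one(DATA_READY, 'b');             /* stale byte, dropped once */
            rx_one(DATA_READY, 'c');             /* delivered again */
            return 0;
    }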
@@ -349,6 +353,7 @@ receive_chars(struct uart_txx9_port *up, unsigned int *status, struct pt_regs *r uart_insert_char(&up->port, disr, TXX9_SIDISR_UOER, ch, flag); ignore_char: + up->port.ignore_status_mask = next_ignore_status_mask; disr = sio_in(up, TXX9_SIDISR); } while (!(disr & TXX9_SIDISR_UVALID) && (max_count-- > 0)); spin_unlock(&up->port.lock); @@ -450,14 +455,11 @@ static unsigned int serial_txx9_get_mctrl(struct uart_port *port) static void serial_txx9_set_mctrl(struct uart_port *port, unsigned int mctrl) { struct uart_txx9_port *up = (struct uart_txx9_port *)port; - unsigned long flags; - spin_lock_irqsave(&up->port.lock, flags); if (mctrl & TIOCM_RTS) sio_mask(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); else sio_set(up, TXX9_SIFLCR, TXX9_SIFLCR_RTSSC); - spin_unlock_irqrestore(&up->port.lock, flags); } static void serial_txx9_break_ctl(struct uart_port *port, int break_state) @@ -784,8 +786,14 @@ static void serial_txx9_config_port(struct uart_port *port, int uflags) static int serial_txx9_verify_port(struct uart_port *port, struct serial_struct *ser) { - if (ser->irq < 0 || - ser->baud_base < 9600 || ser->type != PORT_TXX9) + unsigned long new_port = ser->port; + if (HIGH_BITS_OFFSET) + new_port += (unsigned long)ser->port_high << HIGH_BITS_OFFSET; + if (ser->type != port->type || + ser->irq != port->irq || + ser->io_type != port->iotype || + new_port != port->iobase || + (unsigned long)ser->iomem_base != port->mapbase) return -EINVAL; return 0; } @@ -827,7 +835,8 @@ static void __init serial_txx9_register_ports(struct uart_driver *drv) up->port.line = i; up->port.ops = &serial_txx9_pops; - uart_add_one_port(drv, &up->port); + if (up->port.iobase || up->port.mapbase) + uart_add_one_port(drv, &up->port); } } @@ -927,11 +936,6 @@ static int serial_txx9_console_setup(struct console *co, char *options) return -ENODEV; /* - * Temporary fix. - */ - spin_lock_init(&port->lock); - - /* * Disable UART interrupts, set DTR and RTS high * and set speed. 
*/ @@ -1041,11 +1045,10 @@ static int __devinit serial_txx9_register_port(struct uart_port *port) mutex_lock(&serial_txx9_mutex); for (i = 0; i < UART_NR; i++) { uart = &serial_txx9_ports[i]; - if (uart->port.type == PORT_UNKNOWN) + if (!(uart->port.iobase || uart->port.mapbase)) break; } if (i < UART_NR) { - uart_remove_one_port(&serial_txx9_reg, &uart->port); uart->port.iobase = port->iobase; uart->port.membase = port->membase; uart->port.irq = port->irq; @@ -1080,9 +1083,8 @@ static void __devexit serial_txx9_unregister_port(int line) uart->port.type = PORT_UNKNOWN; uart->port.iobase = 0; uart->port.mapbase = 0; - uart->port.membase = 0; + uart->port.membase = NULL; uart->port.dev = NULL; - uart_add_one_port(&serial_txx9_reg, &uart->port); mutex_unlock(&serial_txx9_mutex); } @@ -1198,8 +1200,11 @@ static void __exit serial_txx9_exit(void) #ifdef ENABLE_SERIAL_TXX9_PCI pci_unregister_driver(&serial_txx9_pci_driver); #endif - for (i = 0; i < UART_NR; i++) - uart_remove_one_port(&serial_txx9_reg, &serial_txx9_ports[i].port); + for (i = 0; i < UART_NR; i++) { + struct uart_txx9_port *up = &serial_txx9_ports[i]; + if (up->port.iobase || up->port.mapbase) + uart_remove_one_port(&serial_txx9_reg, &up->port); + } uart_unregister_driver(&serial_txx9_reg); } diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c index d61494d185c..bd6294132c1 100644 --- a/drivers/serial/vr41xx_siu.c +++ b/drivers/serial/vr41xx_siu.c @@ -919,7 +919,7 @@ static struct uart_driver siu_uart_driver = { .cons = SERIAL_VR41XX_CONSOLE, }; -static int siu_probe(struct platform_device *dev) +static int __devinit siu_probe(struct platform_device *dev) { struct uart_port *port; int num, i, retval; @@ -953,7 +953,7 @@ static int siu_probe(struct platform_device *dev) return 0; } -static int siu_remove(struct platform_device *dev) +static int __devexit siu_remove(struct platform_device *dev) { struct uart_port *port; int i; @@ -1006,21 +1006,28 @@ static struct platform_device *siu_platform_device; static struct platform_driver siu_device_driver = { .probe = siu_probe, - .remove = siu_remove, + .remove = __devexit_p(siu_remove), .suspend = siu_suspend, .resume = siu_resume, .driver = { .name = "SIU", + .owner = THIS_MODULE, }, }; -static int __devinit vr41xx_siu_init(void) +static int __init vr41xx_siu_init(void) { int retval; - siu_platform_device = platform_device_register_simple("SIU", -1, NULL, 0); - if (IS_ERR(siu_platform_device)) - return PTR_ERR(siu_platform_device); + siu_platform_device = platform_device_alloc("SIU", -1); + if (!siu_platform_device) + return -ENOMEM; + + retval = platform_device_add(siu_platform_device); + if (retval < 0) { + platform_device_put(siu_platform_device); + return retval; + } retval = platform_driver_register(&siu_device_driver); if (retval < 0) @@ -1029,10 +1036,9 @@ static int __devinit vr41xx_siu_init(void) return retval; } -static void __devexit vr41xx_siu_exit(void) +static void __exit vr41xx_siu_exit(void) { platform_driver_unregister(&siu_device_driver); - platform_device_unregister(siu_platform_device); } diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c index ea75b3d0612..67140a5804f 100644 --- a/drivers/sn/ioc4.c +++ b/drivers/sn/ioc4.c @@ -31,7 +31,7 @@ #include <linux/ioc4.h> #include <linux/mmtimer.h> #include <linux/rtc.h> -#include <linux/rwsem.h> +#include <linux/mutex.h> #include <asm/sn/addrs.h> #include <asm/sn/clksupport.h> #include <asm/sn/shub_mmr.h> @@ -54,11 +54,10 @@ * Submodule management * ************************/ -static 
LIST_HEAD(ioc4_devices); -static DECLARE_RWSEM(ioc4_devices_rwsem); +static DEFINE_MUTEX(ioc4_mutex); +static LIST_HEAD(ioc4_devices); static LIST_HEAD(ioc4_submodules); -static DECLARE_RWSEM(ioc4_submodules_rwsem); /* Register an IOC4 submodule */ int @@ -66,15 +65,13 @@ ioc4_register_submodule(struct ioc4_submodule *is) { struct ioc4_driver_data *idd; - down_write(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_add(&is->is_list, &ioc4_submodules); - up_write(&ioc4_submodules_rwsem); /* Initialize submodule for each IOC4 */ if (!is->is_probe) - return 0; + goto out; - down_read(&ioc4_devices_rwsem); list_for_each_entry(idd, &ioc4_devices, idd_list) { if (is->is_probe(idd)) { printk(KERN_WARNING @@ -84,8 +81,8 @@ ioc4_register_submodule(struct ioc4_submodule *is) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_devices_rwsem); - + out: + mutex_unlock(&ioc4_mutex); return 0; } @@ -95,15 +92,13 @@ ioc4_unregister_submodule(struct ioc4_submodule *is) { struct ioc4_driver_data *idd; - down_write(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_del(&is->is_list); - up_write(&ioc4_submodules_rwsem); /* Remove submodule for each IOC4 */ if (!is->is_remove) - return; + goto out; - down_read(&ioc4_devices_rwsem); list_for_each_entry(idd, &ioc4_devices, idd_list) { if (is->is_remove(idd)) { printk(KERN_WARNING @@ -113,7 +108,8 @@ ioc4_unregister_submodule(struct ioc4_submodule *is) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_devices_rwsem); + out: + mutex_unlock(&ioc4_mutex); } /********************* @@ -312,12 +308,11 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) /* Track PCI-device specific data */ idd->idd_serial_data = NULL; pci_set_drvdata(idd->idd_pdev, idd); - down_write(&ioc4_devices_rwsem); + + mutex_lock(&ioc4_mutex); list_add(&idd->idd_list, &ioc4_devices); - up_write(&ioc4_devices_rwsem); /* Add this IOC4 to all submodules */ - down_read(&ioc4_submodules_rwsem); list_for_each_entry(is, &ioc4_submodules, is_list) { if (is->is_probe && is->is_probe(idd)) { printk(KERN_WARNING @@ -327,7 +322,7 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_submodules_rwsem); + mutex_unlock(&ioc4_mutex); return 0; @@ -351,7 +346,7 @@ ioc4_remove(struct pci_dev *pdev) idd = pci_get_drvdata(pdev); /* Remove this IOC4 from all submodules */ - down_read(&ioc4_submodules_rwsem); + mutex_lock(&ioc4_mutex); list_for_each_entry(is, &ioc4_submodules, is_list) { if (is->is_remove && is->is_remove(idd)) { printk(KERN_WARNING @@ -361,7 +356,7 @@ ioc4_remove(struct pci_dev *pdev) pci_name(idd->idd_pdev)); } } - up_read(&ioc4_submodules_rwsem); + mutex_unlock(&ioc4_mutex); /* Release resources */ iounmap(idd->idd_misc_regs); @@ -377,9 +372,9 @@ ioc4_remove(struct pci_dev *pdev) pci_disable_device(pdev); /* Remove and free driver data */ - down_write(&ioc4_devices_rwsem); + mutex_lock(&ioc4_mutex); list_del(&idd->idd_list); - up_write(&ioc4_devices_rwsem); + mutex_unlock(&ioc4_mutex); kfree(idd); } diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index b058273527b..76448d6ae89 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end) */ page = virt_to_page(virtual_start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(virtual_start); virtual_start += PAGE_SIZE; diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c index 
d8467c03b49..788297e9d59 100644 --- a/drivers/video/i810/i810_main.c +++ b/drivers/video/i810/i810_main.c @@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info *info, struct fb_cursor *cursor) int size = ((cursor->image.width + 7) >> 3) * cursor->image.height; int i; - u8 *data = kmalloc(64 * 8, GFP_KERNEL); + u8 *data = kmalloc(64 * 8, GFP_ATOMIC); if (data == NULL) return -ENOMEM; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3ad8455f857..651a9e14d9a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); + dentry->d_op = &v9fs_dentry_operations; dirfid = v9fs_fid_lookup(dentry->d_parent); if (!dirfid) { @@ -681,8 +682,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; fid->qid = fcall->params.rstat.stat.qid; - - dentry->d_op = &v9fs_dentry_operations; v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); d_add(dentry, inode); diff --git a/fs/buffer.c b/fs/buffer.c index a9b39940200..1d3683d496f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3051,68 +3051,6 @@ asmlinkage long sys_bdflush(int func, long data) } /* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -#ifdef CONFIG_MIGRATION -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); -#endif - -/* * Buffer-head allocation */ static kmem_cache_t *bh_cachep; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b3519528994..25fa8bba8cb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache. 
- * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -static unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - pgoff_t endpg = next + hugepages; - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long bytes; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; + if (vma->vm_flags & VM_MAYSHARE) + if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) + goto out; + ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) @@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) put_page(page); } -static void truncate_hugepages(struct address_space *mapping, loff_t lstart) +static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; int i; + hugetlb_truncate_reservation(HUGETLBFS_I(inode), + lstart >> HPAGE_SHIFT); + if (!mapping->nrpages) + return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); } @@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); destroy_inode(inode); } @@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); spin_unlock(&mapping->i_mmap_lock); - truncate_hugepages(mapping, offset); + truncate_hugepages(inode, offset); return 0; } @@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -771,21 +737,6 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -/* - * 
Return the next identifier for a shm file - */ -static unsigned long hugetlbfs_counter(void) -{ - static DEFINE_SPINLOCK(lock); - static unsigned long counter; - unsigned long ret; - - spin_lock(&lock); - ret = ++counter; - spin_unlock(&lock); - return ret; -} - static int can_do_hugetlb_shm(void) { return likely(capable(CAP_IPC_LOCK) || @@ -801,18 +752,16 @@ struct file *hugetlb_zero_setup(size_t size) struct dentry *dentry, *root; struct qstr quick_string; char buf[16]; + static atomic_t counter; if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!is_hugepage_mem_enough(size)) - return ERR_PTR(-ENOMEM); - if (!user_shm_lock(size, current->user)) return ERR_PTR(-ENOMEM); root = hugetlbfs_vfsmount->mnt_root; - snprintf(buf, 16, "%lu", hugetlbfs_counter()); + snprintf(buf, 16, "%u", atomic_inc_return(&counter)); quick_string.name = buf; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; @@ -831,6 +780,11 @@ struct file *hugetlb_zero_setup(size_t size) if (!inode) goto out_file; + error = -ENOMEM; + if (hugetlb_extend_reservation(HUGETLBFS_I(inode), + size >> HPAGE_SHIFT) != 0) + goto out_inode; + d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; @@ -841,6 +795,8 @@ struct file *hugetlb_zero_setup(size_t size) file->f_mode = FMODE_WRITE | FMODE_READ; return file; +out_inode: + iput(inode); out_file: put_filp(file); out_dentry: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8dd3aafec49..09e1c57a86a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void) ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", sizeof(struct ocfs2_journal_lock), 0, - SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!ocfs2_lock_cache) return -ENOMEM; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 3f810acd0bf..b1ca234068f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) xpages = 1UL << order; npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - for (loop = 0; loop < npages; loop++) - set_page_count(pages + loop, 1); + split_page(pages, order); /* trim off any pages we don't actually require */ for (loop = npages; loop < xpages; loop++) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index bfb4f2917bb..8cdfa415165 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -29,6 +29,7 @@ #include <linux/blkdev.h> #include <linux/hash.h> #include <linux/kthread.h> +#include <linux/migrate.h> #include "xfs_linux.h" STATIC kmem_zone_t *xfs_buf_zone; diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h index 55059abf9c9..20f52395421 100644 --- a/include/asm-i386/acpi.h +++ b/include/asm-i386/acpi.h @@ -103,6 +103,12 @@ __acpi_release_global_lock (unsigned int *lock) :"=r"(n_hi), "=r"(n_lo) \ :"0"(n_hi), "1"(n_lo)) +#ifdef CONFIG_X86_IO_APIC +extern void check_acpi_pci(void); +#else +static inline void check_acpi_pci(void) { } +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -128,8 +134,6 @@ extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); extern int skip_ioapic_setup; extern int acpi_skip_timer_override; -extern void check_acpi_pci(void); - static inline void disable_ioapic_setup(void) { skip_ioapic_setup = 1; @@ -142,8 +146,6 @@ static inline int ioapic_setup_disabled(void) #else static inline void disable_ioapic_setup(void) { } -static inline void check_acpi_pci(void) { } - 
#endif static inline void acpi_noirq_set(void) { acpi_noirq = 1; } diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 088a945bf26..ee056c41a9f 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -219,13 +219,12 @@ extern unsigned long pg0[]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT) static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } -static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } /* * The following only works if pte_present() is not true. @@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h index a7122d85017..d069b6acddc 100644 --- a/include/asm-ia64/intel_intrin.h +++ b/include/asm-ia64/intel_intrin.h @@ -5,113 +5,10 @@ * * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com> * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com> + * Copyright (C) 2005,2006 Hongjiu Lu <hongjiu.lu@intel.com> * */ -#include <asm/types.h> - -void __lfetch(int lfhint, void *y); -void __lfetch_excl(int lfhint, void *y); -void __lfetch_fault(int lfhint, void *y); -void __lfetch_fault_excl(int lfhint, void *y); - -/* In the following, whichFloatReg should be an integer from 0-127 */ -void __ldfs(const int whichFloatReg, void *src); -void __ldfd(const int whichFloatReg, void *src); -void __ldfe(const int whichFloatReg, void *src); -void __ldf8(const int whichFloatReg, void *src); -void __ldf_fill(const int whichFloatReg, void *src); -void __stfs(void *dst, const int whichFloatReg); -void __stfd(void *dst, const int whichFloatReg); -void __stfe(void *dst, const int whichFloatReg); -void __stf8(void *dst, const int whichFloatReg); -void __stf_spill(void *dst, const int whichFloatReg); - -void __st1_rel(void *dst, const __s8 value); -void __st2_rel(void *dst, const __s16 value); -void __st4_rel(void *dst, const __s32 value); -void __st8_rel(void *dst, const __s64 value); -__u8 __ld1_acq(void *src); -__u16 __ld2_acq(void *src); -__u32 __ld4_acq(void *src); -__u64 __ld8_acq(void *src); - -__u64 __fetchadd4_acq(__u32 *addend, const int increment); -__u64 __fetchadd4_rel(__u32 *addend, const int increment); -__u64 __fetchadd8_acq(__u64 *addend, const int increment); -__u64 __fetchadd8_rel(__u64 *addend, const int increment); - -__u64 __getf_exp(double d); - -/* OS Related Itanium(R) Intrinsics */ - -/* The names to use for whichReg and whichIndReg below come from - the include file 
asm/ia64regs.h */ - -__u64 __getIndReg(const int whichIndReg, __s64 index); -__u64 __getReg(const int whichReg); - -void __setIndReg(const int whichIndReg, __s64 index, __u64 value); -void __setReg(const int whichReg, __u64 value); - -void __mf(void); -void __mfa(void); -void __synci(void); -void __itcd(__s64 pa); -void __itci(__s64 pa); -void __itrd(__s64 whichTransReg, __s64 pa); -void __itri(__s64 whichTransReg, __s64 pa); -void __ptce(__s64 va); -void __ptcl(__s64 va, __s64 pagesz); -void __ptcg(__s64 va, __s64 pagesz); -void __ptcga(__s64 va, __s64 pagesz); -void __ptri(__s64 va, __s64 pagesz); -void __ptrd(__s64 va, __s64 pagesz); -void __invala (void); -void __invala_gr(const int whichGeneralReg /* 0-127 */ ); -void __invala_fr(const int whichFloatReg /* 0-127 */ ); -void __nop(const int); -void __fc(__u64 *addr); -void __sum(int mask); -void __rum(int mask); -void __ssm(int mask); -void __rsm(int mask); -__u64 __thash(__s64); -__u64 __ttag(__s64); -__s64 __tpa(__s64); - -/* Intrinsics for implementing get/put_user macros */ -void __st_user(const char *tableName, __u64 addr, char size, char relocType, __u64 val); -void __ld_user(const char *tableName, __u64 addr, char size, char relocType); - -/* This intrinsic does not generate code, it creates a barrier across which - * the compiler will not schedule data access instructions. - */ -void __memory_barrier(void); - -void __isrlz(void); -void __dsrlz(void); - -__u64 _m64_mux1(__u64 a, const int n); -__u64 __thash(__u64); - -/* Lock and Atomic Operation Related Intrinsics */ -__u64 _InterlockedExchange8(volatile __u8 *trgt, __u8 value); -__u64 _InterlockedExchange16(volatile __u16 *trgt, __u16 value); -__s64 _InterlockedExchange(volatile __u32 *trgt, __u32 value); -__s64 _InterlockedExchange64(volatile __u64 *trgt, __u64 value); - -__u64 _InterlockedCompareExchange8_rel(volatile __u8 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange8_acq(volatile __u8 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange16_rel(volatile __u16 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange16_acq(volatile __u16 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange_rel(volatile __u32 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange_acq(volatile __u32 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange64_rel(volatile __u64 *dest, __u64 xchg, __u64 comp); -__u64 _InterlockedCompareExchange64_acq(volatile __u64 *dest, __u64 xchg, __u64 comp); - -__s64 _m64_dep_mi(const int v, __s64 s, const int p, const int len); -__s64 _m64_shrp(__s64 a, __s64 b, const int count); -__s64 _m64_popcnt(__s64 a); +#include <ia64intrin.h> #define ia64_barrier() __memory_barrier() @@ -122,15 +19,16 @@ __s64 _m64_popcnt(__s64 a); #define ia64_getreg __getReg #define ia64_setreg __setReg -#define ia64_hint(x) +#define ia64_hint __hint +#define ia64_hint_pause __hint_pause -#define ia64_mux1_brcst 0 -#define ia64_mux1_mix 8 -#define ia64_mux1_shuf 9 -#define ia64_mux1_alt 10 -#define ia64_mux1_rev 11 +#define ia64_mux1_brcst _m64_mux1_brcst +#define ia64_mux1_mix _m64_mux1_mix +#define ia64_mux1_shuf _m64_mux1_shuf +#define ia64_mux1_alt _m64_mux1_alt +#define ia64_mux1_rev _m64_mux1_rev -#define ia64_mux1 _m64_mux1 +#define ia64_mux1(x,v) _m_to_int64(_m64_mux1(_m_from_int64(x), (v))) #define ia64_popcnt _m64_popcnt #define ia64_getf_exp __getf_exp #define ia64_shrp _m64_shrp @@ -158,7 +56,7 @@ __s64 _m64_popcnt(__s64 a); #define ia64_stf8 __stf8 #define ia64_stf_spill __stf_spill -#define 
ia64_mf __mf +#define ia64_mf __mf #define ia64_mfa __mfa #define ia64_fetchadd4_acq __fetchadd4_acq @@ -234,10 +132,10 @@ __s64 _m64_popcnt(__s64 a); /* Values for lfhint in __lfetch and __lfetch_fault */ -#define ia64_lfhint_none 0 -#define ia64_lfhint_nt1 1 -#define ia64_lfhint_nt2 2 -#define ia64_lfhint_nta 3 +#define ia64_lfhint_none __lfhint_none +#define ia64_lfhint_nt1 __lfhint_nt1 +#define ia64_lfhint_nt2 __lfhint_nt2 +#define ia64_lfhint_nta __lfhint_nta #define ia64_lfetch __lfetch #define ia64_lfetch_excl __lfetch_excl @@ -254,4 +152,6 @@ do { \ } \ } while (0) +#define __builtin_trap() __break(0); + #endif /* _ASM_IA64_INTEL_INTRIN_H */ diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h index ca5ea994d68..c3e4ed8a3e1 100644 --- a/include/asm-ia64/machvec.h +++ b/include/asm-ia64/machvec.h @@ -20,6 +20,7 @@ struct scatterlist; struct page; struct mm_struct; struct pci_bus; +struct task_struct; typedef void ia64_mv_setup_t (char **); typedef void ia64_mv_cpu_init_t (void); @@ -34,6 +35,7 @@ typedef int ia64_mv_pci_legacy_read_t (struct pci_bus *, u16 port, u32 *val, u8 size); typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val, u8 size); +typedef void ia64_mv_migrate_t(struct task_struct * task); /* DMA-mapping interface: */ typedef void ia64_mv_dma_init (void); @@ -85,6 +87,11 @@ machvec_noop_mm (struct mm_struct *mm) { } +static inline void +machvec_noop_task (struct task_struct *task) +{ +} + extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *, struct pt_regs *); extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int); @@ -146,6 +153,7 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_readw_relaxed ia64_mv.readw_relaxed # define platform_readl_relaxed ia64_mv.readl_relaxed # define platform_readq_relaxed ia64_mv.readq_relaxed +# define platform_migrate ia64_mv.migrate # endif /* __attribute__((__aligned__(16))) is required to make size of the @@ -194,6 +202,7 @@ struct ia64_machine_vector { ia64_mv_readw_relaxed_t *readw_relaxed; ia64_mv_readl_relaxed_t *readl_relaxed; ia64_mv_readq_relaxed_t *readq_relaxed; + ia64_mv_migrate_t *migrate; } __attribute__((__aligned__(16))); /* align attrib? see above comment */ #define MACHVEC_INIT(name) \ @@ -238,6 +247,7 @@ struct ia64_machine_vector { platform_readw_relaxed, \ platform_readl_relaxed, \ platform_readq_relaxed, \ + platform_migrate, \ } extern struct ia64_machine_vector ia64_mv; @@ -386,5 +396,8 @@ extern ia64_mv_dma_supported swiotlb_dma_supported; #ifndef platform_readq_relaxed # define platform_readq_relaxed __ia64_readq_relaxed #endif +#ifndef platform_migrate +# define platform_migrate machvec_noop_task +#endif #endif /* _ASM_IA64_MACHVEC_H */ diff --git a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h index 03d00faf03b..da1d43755af 100644 --- a/include/asm-ia64/machvec_sn2.h +++ b/include/asm-ia64/machvec_sn2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003, 2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2002-2003,2006 Silicon Graphics, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -66,6 +66,7 @@ extern ia64_mv_dma_sync_single_for_device sn_dma_sync_single_for_device; extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device; extern ia64_mv_dma_mapping_error sn_dma_mapping_error; extern ia64_mv_dma_supported sn_dma_supported; +extern ia64_mv_migrate_t sn_migrate; /* * This stuff has dual use! @@ -115,6 +116,7 @@ extern ia64_mv_dma_supported sn_dma_supported; #define platform_dma_sync_sg_for_device sn_dma_sync_sg_for_device #define platform_dma_mapping_error sn_dma_mapping_error #define platform_dma_supported sn_dma_supported +#define platform_migrate sn_migrate #include <asm/sn/io.h> diff --git a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h index c7d9c9ed38b..bfbbb8da79c 100644 --- a/include/asm-ia64/mca.h +++ b/include/asm-ia64/mca.h @@ -131,6 +131,8 @@ struct ia64_mca_cpu { /* Array of physical addresses of each CPU's MCA area. */ extern unsigned long __per_cpu_mca[NR_CPUS]; +extern int cpe_vector; +extern int ia64_cpe_irq; extern void ia64_mca_init(void); extern void ia64_mca_cpu_init(void *); extern void ia64_os_mca_dispatch(void); diff --git a/include/asm-ia64/mutex.h b/include/asm-ia64/mutex.h index 458c1f7fbc1..5a3224f6af3 100644 --- a/include/asm-ia64/mutex.h +++ b/include/asm-ia64/mutex.h @@ -1,9 +1,92 @@ /* - * Pull in the generic implementation for the mutex fastpath. + * ia64 implementation of the mutex fastpath. * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) + * Copyright (C) 2006 Ken Chen <kenneth.w.chen@intel.com> + * + */ + +#ifndef _ASM_MUTEX_H +#define _ASM_MUTEX_H + +/** + * __mutex_fastpath_lock - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if + * it wasn't 1 originally. This function MUST leave the value lower than + * 1 even when the "1" assertion wasn't true. + */ +static inline void +__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) +{ + if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) + fail_fn(count); +} + +/** + * __mutex_fastpath_lock_retval - try to take the lock by moving the count + * from 1 to a 0 value + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 1 + * + * Change the count from 1 to a value lower than 1, and call <fail_fn> if + * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, + * or anything the slow path function returns. + */ +static inline int +__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) +{ + if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) + return fail_fn(count); + return 0; +} + +/** + * __mutex_fastpath_unlock - try to promote the count from 0 to 1 + * @count: pointer of type atomic_t + * @fail_fn: function to call if the original value was not 0 + * + * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>. + * In the failure case, this function is allowed to either set the value to + * 1, or to set it to a value lower than 1. 
+ * + * If the implementation sets it to a value of lower than 1, then the + * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs + * to return 0 otherwise. + */ +static inline void +__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) +{ + int ret = ia64_fetchadd4_rel(count, 1); + if (unlikely(ret < 0)) + fail_fn(count); +} + +#define __mutex_slowpath_needs_to_unlock() 1 + +/** + * __mutex_fastpath_trylock - try to acquire the mutex, without waiting + * + * @count: pointer of type atomic_t + * @fail_fn: fallback function + * + * Change the count from 1 to a value lower than 1, and return 0 (failure) + * if it wasn't 1 originally, or return 1 (success) otherwise. This function + * MUST leave the value lower than 1 even when the "1" assertion wasn't true. + * Additionally, if the value was < 0 originally, this function must not leave + * it to 0 on failure. + * + * If the architecture has no effective trylock variant, it should call the + * <fail_fn> spinlock-based trylock variant unconditionally. */ +static inline int +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) +{ + if (likely(cmpxchg_acq(count, 1, 0)) == 1) + return 1; + return 0; +} -#include <asm-generic/mutex-dec.h> +#endif diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 5e6362a786b..3ab27333dae 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,8 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE +# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ #ifdef __ASSEMBLY__ diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index e2560c58384..c0f8144f234 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) -#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) +#define pte_mkhuge(pte) (__pte(pte_val(pte))) /* * Macro to a page protection value as "uncacheable". Note that "protection" is really a @@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr; #define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3)) #define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT) #define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1)) -struct mmu_gather; -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, - unsigned long end, unsigned long floor, unsigned long ceiling); #endif /* diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index 23c8e1be191..128fefd8056 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -50,7 +50,8 @@ #define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */ #define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */ #define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. 
*/ - /* bit 5 is currently unused */ +#define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration + sync at ctx sw */ #define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6) /* don't log any fpswa faults */ #define IA64_THREAD_FPEMU_SIGFPE (__IA64_UL(1) << 7) /* send a SIGFPE for fpswa faults */ diff --git a/include/asm-ia64/signal.h b/include/asm-ia64/signal.h index 608168d713d..5e328ed5d01 100644 --- a/include/asm-ia64/signal.h +++ b/include/asm-ia64/signal.h @@ -158,8 +158,6 @@ struct k_sigaction { #define ptrace_signal_deliver(regs, cookie) do { } while (0) -void set_sigdelayed(pid_t pid, int signo, int code, void __user *addr); - #endif /* __KERNEL__ */ # endif /* !__ASSEMBLY__ */ diff --git a/include/asm-ia64/sn/addrs.h b/include/asm-ia64/sn/addrs.h index 2c32e4b77b5..1d9efe54166 100644 --- a/include/asm-ia64/sn/addrs.h +++ b/include/asm-ia64/sn/addrs.h @@ -283,5 +283,13 @@ #define REMOTE_HUB_L(n, a) HUB_L(REMOTE_HUB_ADDR((n), (a))) #define REMOTE_HUB_S(n, a, d) HUB_S(REMOTE_HUB_ADDR((n), (a)), (d)) +/* + * Coretalk address breakdown + */ +#define CTALK_NASID_SHFT 40 +#define CTALK_NASID_MASK (0x3FFFULL << CTALK_NASID_SHFT) +#define CTALK_CID_SHFT 38 +#define CTALK_CID_MASK (0x3ULL << CTALK_CID_SHFT) +#define CTALK_NODE_OFFSET 0x3FFFFFFFFF #endif /* _ASM_IA64_SN_ADDRS_H */ diff --git a/include/asm-ia64/sn/rw_mmr.h b/include/asm-ia64/sn/rw_mmr.h index f40fd1a5510..2d78f4c5a45 100644 --- a/include/asm-ia64/sn/rw_mmr.h +++ b/include/asm-ia64/sn/rw_mmr.h @@ -3,15 +3,14 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2002-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2002-2006 Silicon Graphics, Inc. All Rights Reserved. */ #ifndef _ASM_IA64_SN_RW_MMR_H #define _ASM_IA64_SN_RW_MMR_H /* - * This file contains macros used to access MMR registers via - * uncached physical addresses. + * This file that access MMRs via uncached physical addresses. 
* pio_phys_read_mmr - read an MMR * pio_phys_write_mmr - write an MMR * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 @@ -22,53 +21,8 @@ */ -extern inline long -pio_phys_read_mmr(volatile long *mmr) -{ - long val; - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt;;" - "srlz.i;;" - "ld8.acq %0=[%1];;" - "mov psr.l=r2;;" - "srlz.i;;" - : "=r"(val) - : "r"(mmr) - : "r2"); - return val; -} - - - -extern inline void -pio_phys_write_mmr(volatile long *mmr, long val) -{ - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt;;" - "srlz.i;;" - "st8.rel [%0]=%1;;" - "mov psr.l=r2;;" - "srlz.i;;" - :: "r"(mmr), "r"(val) - : "r2", "memory"); -} - -extern inline void -pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2) -{ - asm volatile - ("mov r2=psr;;" - "rsm psr.i | psr.dt | psr.ic;;" - "cmp.ne p9,p0=%2,r0;" - "srlz.i;;" - "st8.rel [%0]=%1;" - "(p9) st8.rel [%2]=%3;;" - "mov psr.l=r2;;" - "srlz.i;;" - :: "r"(mmr1), "r"(val1), "r"(mmr2), "r"(val2) - : "p9", "r2", "memory"); -} +extern long pio_phys_read_mmr(volatile long *mmr); +extern void pio_phys_write_mmr(volatile long *mmr, long val); +extern void pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2); #endif /* _ASM_IA64_SN_RW_MMR_H */ diff --git a/include/asm-ia64/sn/tioce.h b/include/asm-ia64/sn/tioce.h index d4c990712ea..893468e1b41 100644 --- a/include/asm-ia64/sn/tioce.h +++ b/include/asm-ia64/sn/tioce.h @@ -11,7 +11,7 @@ /* CE ASIC part & mfgr information */ #define TIOCE_PART_NUM 0xCE00 -#define TIOCE_MFGR_NUM 0x36 +#define TIOCE_SRC_ID 0x01 #define TIOCE_REV_A 0x1 /* CE Virtual PPB Vendor/Device IDs */ @@ -20,7 +20,7 @@ /* CE Host Bridge Vendor/Device IDs */ #define CE_HOST_BRIDGE_VENDOR_ID 0x10a9 -#define CE_HOST_BRIDGE_DEVICE_ID 0x4003 +#define CE_HOST_BRIDGE_DEVICE_ID 0x4001 #define TIOCE_NUM_M40_ATES 4096 @@ -463,6 +463,25 @@ typedef volatile struct tioce { u64 ce_end_of_struct; /* 0x044400 */ } tioce_t; +/* ce_lsiX_gb_cfg1 register bit masks & shifts */ +#define CE_LSI_GB_CFG1_RXL0S_THS_SHFT 0 +#define CE_LSI_GB_CFG1_RXL0S_THS_MASK (0xffULL << 0) +#define CE_LSI_GB_CFG1_RXL0S_SMP_SHFT 8 +#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK (0xfULL << 8); +#define CE_LSI_GB_CFG1_RXL0S_ADJ_SHFT 12 +#define CE_LSI_GB_CFG1_RXL0S_ADJ_MASK (0x7ULL << 12) +#define CE_LSI_GB_CFG1_RXL0S_FLT_SHFT 15 +#define CE_LSI_GB_CFG1_RXL0S_FLT_MASK (0x1ULL << 15) +#define CE_LSI_GB_CFG1_LPBK_SEL_SHFT 16 +#define CE_LSI_GB_CFG1_LPBK_SEL_MASK (0x3ULL << 16) +#define CE_LSI_GB_CFG1_LPBK_EN_SHFT 18 +#define CE_LSI_GB_CFG1_LPBK_EN_MASK (0x1ULL << 18) +#define CE_LSI_GB_CFG1_RVRS_LB_SHFT 19 +#define CE_LSI_GB_CFG1_RVRS_LB_MASK (0x1ULL << 19) +#define CE_LSI_GB_CFG1_RVRS_CLK_SHFT 20 +#define CE_LSI_GB_CFG1_RVRS_CLK_MASK (0x3ULL << 20) +#define CE_LSI_GB_CFG1_SLF_TS_SHFT 24 +#define CE_LSI_GB_CFG1_SLF_TS_MASK (0xfULL << 24) /* ce_adm_int_mask/ce_adm_int_status register bit defines */ #define CE_ADM_INT_CE_ERROR_SHFT 0 @@ -592,6 +611,11 @@ typedef volatile struct tioce { #define CE_URE_RD_MRG_ENABLE (0x1ULL << 0) #define CE_URE_WRT_MRG_ENABLE1 (0x1ULL << 4) #define CE_URE_WRT_MRG_ENABLE2 (0x1ULL << 5) +#define CE_URE_WRT_MRG_TIMER_SHFT 12 +#define CE_URE_WRT_MRG_TIMER_MASK (0x7FFULL << CE_URE_WRT_MRG_TIMER_SHFT) +#define CE_URE_WRT_MRG_TIMER(x) (((u64)(x) << \ + CE_URE_WRT_MRG_TIMER_SHFT) & \ + CE_URE_WRT_MRG_TIMER_MASK) #define CE_URE_RSPQ_BYPASS_DISABLE (0x1ULL << 24) #define CE_URE_UPS_DAT1_PAR_DISABLE (0x1ULL << 32) #define CE_URE_UPS_HDR1_PAR_DISABLE (0x1ULL << 33) 
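
The CE_URE_WRT_MRG_TIMER(x) macro added above follows the usual shift-and-mask idiom for building a register field: shift the value into position, then mask it to the field width so out-of-range input cannot spill into neighbouring bits. A small stand-alone sketch of the same idiom follows; the EXAMPLE_FIELD_* names are invented and do not belong to the real TIO CE register layout.

    /* Hedged sketch of the shift-and-mask field encoding used by the
     * CE_URE_WRT_MRG_TIMER(x)-style macros above.  EXAMPLE_* names are
     * illustrative only. */
    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define EXAMPLE_FIELD_SHFT   12
    #define EXAMPLE_FIELD_MASK   (0x7FFULL << EXAMPLE_FIELD_SHFT)
    #define EXAMPLE_FIELD_SET(x) (((uint64_t)(x) << EXAMPLE_FIELD_SHFT) & \
                                  EXAMPLE_FIELD_MASK)

    int main(void)
    {
            uint64_t reg = 0;

            /* values wider than the field are silently truncated to it */
            reg |= EXAMPLE_FIELD_SET(0x123);
            printf("reg = 0x%" PRIx64 "\n", reg);    /* prints 0x123000 */
            return 0;
    }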
@@ -653,8 +677,12 @@ typedef volatile struct tioce { #define CE_URE_SI (0x1ULL << 0) #define CE_URE_ELAL_SHFT 4 #define CE_URE_ELAL_MASK (0x7ULL << CE_URE_ELAL_SHFT) +#define CE_URE_ELAL_SET(n) (((u64)(n) << CE_URE_ELAL_SHFT) & \ + CE_URE_ELAL_MASK) #define CE_URE_ELAL1_SHFT 8 #define CE_URE_ELAL1_MASK (0x7ULL << CE_URE_ELAL1_SHFT) +#define CE_URE_ELAL1_SET(n) (((u64)(n) << CE_URE_ELAL1_SHFT) & \ + CE_URE_ELAL1_MASK) #define CE_URE_SCC (0x1ULL << 12) #define CE_URE_PN1_SHFT 16 #define CE_URE_PN1_MASK (0xFFULL << CE_URE_PN1_SHFT) @@ -675,8 +703,12 @@ typedef volatile struct tioce { #define CE_URE_HPC (0x1ULL << 6) #define CE_URE_SPLV_SHFT 7 #define CE_URE_SPLV_MASK (0xFFULL << CE_URE_SPLV_SHFT) +#define CE_URE_SPLV_SET(n) (((u64)(n) << CE_URE_SPLV_SHFT) & \ + CE_URE_SPLV_MASK) #define CE_URE_SPLS_SHFT 15 #define CE_URE_SPLS_MASK (0x3ULL << CE_URE_SPLS_SHFT) +#define CE_URE_SPLS_SET(n) (((u64)(n) << CE_URE_SPLS_SHFT) & \ + CE_URE_SPLS_MASK) #define CE_URE_PSN1_SHFT 19 #define CE_URE_PSN1_MASK (0x1FFFULL << CE_URE_PSN1_SHFT) #define CE_URE_PSN2_SHFT 32 diff --git a/include/asm-ia64/sn/xpc.h b/include/asm-ia64/sn/xpc.h index df7f5f4f3cd..aa3b8ace903 100644 --- a/include/asm-ia64/sn/xpc.h +++ b/include/asm-ia64/sn/xpc.h @@ -1227,28 +1227,6 @@ xpc_map_bte_errors(bte_result_t error) -static inline void * -xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) -{ - /* see if kmalloc will give us cachline aligned memory by default */ - *base = kmalloc(size, flags); - if (*base == NULL) { - return NULL; - } - if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { - return *base; - } - kfree(*base); - - /* nope, we'll have to do it ourselves */ - *base = kmalloc(size + L1_CACHE_BYTES, flags); - if (*base == NULL) { - return NULL; - } - return (void *) L1_CACHE_ALIGN((u64) *base); -} - - /* * Check to see if there is any channel activity to/from the specified * partition. 
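
The xpc.h hunk above removes xpc_kmalloc_cacheline_aligned(), which tried a plain kmalloc() first and, when the result happened not to be L1-cache aligned, re-allocated with a cache line of slack and rounded the pointer up, handing the raw allocation back through *base for the eventual kfree(). A hedged user-space sketch of that allocate-with-slack trick is shown below; CACHELINE and the function name are illustrative only, with malloc/free standing in for kmalloc/kfree.

    /* Hedged sketch of the "allocate with slack, round up" trick used by
     * the helper removed above.  The rounded pointer must never be freed
     * directly, which is why the raw pointer is kept in *base. */
    #include <stdint.h>
    #include <stdlib.h>

    #define CACHELINE 128UL                 /* illustrative line size */

    static void *cacheline_aligned_alloc(size_t size, void **base)
    {
            *base = malloc(size);
            if (!*base)
                    return NULL;
            if (((uintptr_t)*base & (CACHELINE - 1)) == 0)
                    return *base;           /* already aligned, use as is */
            free(*base);

            /* over-allocate by one line and round the pointer up */
            *base = malloc(size + CACHELINE);
            if (!*base)
                    return NULL;
            return (void *)(((uintptr_t)*base + CACHELINE - 1) &
                            ~(CACHELINE - 1));
    }

    int main(void)
    {
            void *base;
            void *p = cacheline_aligned_alloc(200, &base);

            /* p is the aligned buffer; base is what must be freed */
            free(base);
            return (p != NULL) ? 0 : 1;
    }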
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h index 06253871562..cd4233d66f1 100644 --- a/include/asm-ia64/system.h +++ b/include/asm-ia64/system.h @@ -244,6 +244,13 @@ extern void ia64_load_extra (struct task_struct *task); __ia64_save_fpu((prev)->thread.fph); \ } \ __switch_to(prev, next, last); \ + /* "next" in old context is "current" in new context */ \ + if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) && \ + (task_cpu(current) != \ + task_thread_info(current)->last_cpu))) { \ + platform_migrate(current); \ + task_thread_info(current)->last_cpu = task_cpu(current); \ + } \ } while (0) #else # define switch_to(prev,next,last) __switch_to(prev, next, last) diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 1d6518fe1f0..56394a2c705 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -26,16 +26,10 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ + __u32 last_cpu; /* Last CPU thread ran on */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; - struct { - int signo; - int code; - void __user *addr; - unsigned long start_time; - pid_t pid; - } sigdelayed; /* Saved information for TIF_SIGDELAYED */ }; #define THREAD_SIZE KERNEL_STACK_SIZE @@ -89,7 +83,6 @@ struct thread_info { #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_TRACE 3 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ -#define TIF_SIGDELAYED 5 /* signal delayed from MCA/INIT/NMI/PMI context */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ @@ -101,13 +94,12 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SIGDELAYED (1 << TIF_SIGDELAYED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) /* "work to do on user-return" bits */ -#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SIGDELAYED) +#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index e38931379a7..185ee15963a 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -#ifdef CONFIG_HUGETLB_PAGE -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - free_pgd_range(tlb, addr, end, floor, ceiling) -#endif - /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. 
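The switch_to() hunk above only invokes platform_migrate() when a task flagged with IA64_THREAD_MIGRATION is found running on a CPU other than the one cached in its thread_info->last_cpu. A small userspace model of that check, with the flag constant and the hook function as stand-ins rather than kernel API, could be:

	#include <stdio.h>

	struct thread_model {
		unsigned int flags;
		unsigned int last_cpu;
	};

	#define THREAD_MIGRATION 0x1	/* stand-in for IA64_THREAD_MIGRATION */

	static void platform_migrate_hook(struct thread_model *t, unsigned int cpu)
	{
		printf("migrating per-thread state to cpu %u\n", cpu);
	}

	static void on_context_switch(struct thread_model *t, unsigned int cpu)
	{
		/* run the hook only when a flagged thread has changed CPUs */
		if ((t->flags & THREAD_MIGRATION) && cpu != t->last_cpu) {
			platform_migrate_hook(t, cpu);
			t->last_cpu = cpu;
		}
	}

	int main(void)
	{
		struct thread_model t = { .flags = THREAD_MIGRATION, .last_cpu = 0 };

		on_context_switch(&t, 0);	/* same CPU: no hook */
		on_context_switch(&t, 2);	/* moved to CPU 2: hook runs once */
		return 0;
	}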
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h index 3417dd71ab4..e28aaf28e4a 100644 --- a/include/asm-s390/pgalloc.h +++ b/include/asm-s390/pgalloc.h @@ -158,11 +158,4 @@ static inline void pte_free(struct page *pte) #define __pte_free_tlb(tlb,pte) tlb_remove_page(tlb,pte) -/* - * This establishes kernel virtual mappings (e.g., as a result of a - * vmalloc call). Since s390-esame uses a separate kernel page table, - * there is nothing to do here... :) - */ -#define set_pgdir(addr,entry) do { } while(0) - #endif /* _S390_PGALLOC_H */ diff --git a/include/asm-sh64/pgalloc.h b/include/asm-sh64/pgalloc.h index 678251ac1db..b29dd468817 100644 --- a/include/asm-sh64/pgalloc.h +++ b/include/asm-sh64/pgalloc.h @@ -167,22 +167,6 @@ static __inline__ void pmd_free(pmd_t *pmd) extern int do_check_pgt_cache(int, int); -static inline void set_pgdir(unsigned long address, pgd_t entry) -{ - struct task_struct * p; - pgd_t *pgd; - - read_lock(&tasklist_lock); - for_each_process(p) { - if (!p->mm) - continue; - *pgd_offset(p->mm,address) = entry; - } - read_unlock(&tasklist_lock); - for (pgd = (pgd_t *)pgd_quicklist; pgd; pgd = (pgd_t *)*(unsigned long *)pgd) - pgd[address >> PGDIR_SHIFT] = entry; -} - #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + (unsigned long) (pte))) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 715fd94cf57..a617d364d08 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } struct vm_area_struct; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad6b17..d6f1019625a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,10 +20,7 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct 
vm_area_struct *vma, unsigned long address, int write_access); @@ -39,18 +36,35 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range +#else +void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, + unsigned long ceiling); #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define prepare_hugepage_range(addr, len) \ - is_aligned_hugepage_range(addr, len) +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. + */ +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} #else int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif @@ -87,20 +101,17 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define is_aligned_hugepage_range(addr, len) 0 #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE @@ -128,6 +139,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -144,6 +157,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/include/linux/migrate.h b/include/linux/migrate.h new file mode 100644 index 00000000000..7d09962c3c0 --- /dev/null +++ b/include/linux/migrate.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_MIGRATE_H +#define _LINUX_MIGRATE_H + +#include <linux/config.h> 
+#include <linux/mm.h> + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p, struct list_head *pagelist); +extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest); +extern int fail_migrate_page(struct page *, struct page *); + +extern int migrate_prep(void); + +#else + +static inline int isolate_lru_page(struct page *p, struct list_head *list) + { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } + +static inline int migrate_prep(void) { return -ENOSYS; } + +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL + +#endif /* CONFIG_MIGRATION */ +#endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 498ff8778fb..6aa016f1d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -286,43 +286,34 @@ struct page { * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. */ /* * Drop a ref, return true if the logical refcount fell to zero (the page has * no users) */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(atomic_read(&(p)->_count) == -1);\ - atomic_add_negative(-1, &(p)->_count); \ - }) +static inline int put_page_testzero(struct page *page) +{ + BUG_ON(atomic_read(&page->_count) == 0); + return atomic_dec_and_test(&page->_count); +} /* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page + * Try to grab a ref unless the page has a refcount of zero, return false if + * that is the case. 
*/ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) - -#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) -#define __put_page(p) atomic_dec(&(p)->_count) +static inline int get_page_unless_zero(struct page *page) +{ + return atomic_inc_not_zero(&page->_count); +} extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) { - if (PageCompound(page)) + if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); - return atomic_read(&page->_count) + 1; + return atomic_read(&page->_count); } static inline void get_page(struct page *page) @@ -332,8 +323,19 @@ static inline void get_page(struct page *page) atomic_inc(&page->_count); } +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + atomic_set(&page->_count, 1); +} + void put_page(struct page *page); +void split_page(struct page *page, unsigned int order); + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of @@ -1046,7 +1048,7 @@ int in_gate_area_no_task(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); void drop_pagecache(void); void drop_slab(void); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8ac854f7f19..3b6723dfaff 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); if (PageActive(page)) { - ClearPageActive(page); + __ClearPageActive(page); zone->nr_active--; } else { zone->nr_inactive--; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d52999c4333..9ea629c02a4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,9 @@ * - The __xxx_page_state variants can be used safely when interrupts are * disabled. * - The __xxx_page_state variants can be used if the field is only - * modified from process context, or only modified from interrupt context. - * In this case, the field should be commented here. + * modified from process context and protected from preemption, or only + * modified from interrupt context. In this case, the field should be + * commented here. 
*/ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ @@ -239,22 +240,19 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) -#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define __ClearPageLRU(page) __clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -#define ClearPageSlab(page) clear_bit(PG_slab, &(page)->flags) -#define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags) -#define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) +#define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags) +#define __ClearPageSlab(page) __clear_bit(PG_slab, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) is_highmem(page_zone(page)) @@ -329,8 +327,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) #define PageCompound(page) test_bit(PG_compound, &(page)->flags) -#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) -#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags) +#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags) #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0b2ba67ff13..b739ac1f7ca 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -11,8 +11,6 @@ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ -#include <linux/interrupt.h> - /* * The struct used to pass data via the following ioctl. 
Similar to the * struct tm in <time.h>, but it needs to be here so that the kernel @@ -95,6 +93,8 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include <linux/interrupt.h> + typedef struct rtc_task { void (*func)(void *private_data); void *private_data; diff --git a/include/linux/slab.h b/include/linux/slab.h index 8cf52939d0a..2b28c849d75 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ #define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ #define SLAB_POISON 0x00000800UL /* Poison objects */ -#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ #define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ #define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ @@ -118,7 +117,7 @@ extern void *kzalloc(size_t, gfp_t); */ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) { - if (n != 0 && size > INT_MAX / n) + if (n != 0 && size > ULONG_MAX / n) return NULL; return kzalloc(n * size, flags); } diff --git a/include/linux/smp.h b/include/linux/smp.h index 44153fdf73f..d699a16b0cb 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -extern int smp_call_function (void (*func) (void *info), void *info, - int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); /* * Call a function on all processors */ -static inline int on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, retry, wait); - func(info); - preempt_enable(); - return ret; -} +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void); #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) -#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) +#define on_each_cpu(func,info,retry,wait) \ + ({ \ + local_irq_disable(); \ + func(info); \ + local_irq_enable(); \ + 0; \ + }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/include/linux/swap.h b/include/linux/swap.h index d572b19afb7..12415dd9445 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,9 +172,24 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, gfp_t); -extern int shrink_all_memory(int); +extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int remove_mapping(struct address_space *mapping, struct page *page); + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +extern pageout_t pageout(struct page *page, struct 
address_space *mapping); #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p); -extern int putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); -extern int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); -extern int fail_migrate_page(struct page *, struct page *); -#else -static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } -static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL -#define fail_migrate_page NULL -#endif - #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/kernel/fork.c b/kernel/fork.c index b373322ca49..9bd7b65ee41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) check_unshare_flags(&unshare_flags); + /* Return -EINVAL for all unsupported flags */ + err = -EINVAL; + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + goto bad_unshare_out; + if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) diff --git a/kernel/sched.c b/kernel/sched.c index 4d46e90f59c..6b6e0d70eb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -707,12 +707,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now) DEF_TIMESLICE); } else { /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1; - - /* * Tasks waking from uninterruptible sleep are * limited in their sleep_avg rise as they * are likely to be waiting on I/O diff --git a/kernel/softirq.c b/kernel/softirq.c index ad3295cdded..ec8fed42a86 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/kthread.h> #include <linux/rcupdate.h> +#include <linux/smp.h> #include <asm/irq.h> /* @@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_SMP +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +{ + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, retry, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); +#endif diff --git a/lib/string.c b/lib/string.c index 037a48acedb..b3c28a3f633 100644 --- a/lib/string.c +++ b/lib/string.c @@ -403,7 +403,6 @@ char *strpbrk(const char *cs, const char *ct) } return NULL; } -EXPORT_SYMBOL(strpbrk); #endif #ifndef __HAVE_ARCH_STRSEP diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae640..bd80460360d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION + bool "Page migration" def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dcc..f10c753dce6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d47699..e8f58f7dd7a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,6 +30,8 @@ #include <linux/security.h> #include <linux/syscalls.h> #include "filemap.h" +#include "internal.h" + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 508707704d2..ebad6bbb350 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,24 +13,48 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/cpuset.h> +#include <linux/mutex.h> #include <asm/page.h> #include <asm/pgtable.h> #include <linux/hugetlb.h> +#include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; - /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ static DEFINE_SPINLOCK(hugetlb_lock); +static void clear_huge_page(struct page *page, unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + cond_resched(); + clear_user_highpage(page + i, addr); + } +} + +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + cond_resched(); + 
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + } +} + static void enqueue_huge_page(struct page *page) { int nid = page_to_nid(page); @@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } -static struct page *alloc_fresh_huge_page(void) +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + +static int alloc_fresh_huge_page(void) { static int nid = 0; struct page *page; page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); - nid = (nid + 1) % num_online_nodes(); + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); if (page) { + page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + return 1; } - return page; + return 0; } -void free_huge_page(struct page *page) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { - BUG_ON(page_count(page)); + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page; + int use_reserve = 0; + unsigned long idx; - INIT_LIST_HEAD(&page->lru); - page[1].lru.next = NULL; /* reset dtor */ + spin_lock(&hugetlb_lock); + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; + } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + + spin_unlock(&hugetlb_lock); + set_page_refcounted(page); + return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). 
+ */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; spin_lock(&hugetlb_lock); - enqueue_huge_page(page); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); + + return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) { + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; struct page *page; - int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. 
*/ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); - set_page_count(page, 1); - page[1].lru.next = (void *)free_huge_page; /* set dtor */ - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_user_highpage(&page[i], addr); - return page; } static int __init hugetlb_init(void) { unsigned long i; - struct page *page; if (HPAGE_SHIFT == 0) return 0; @@ -123,12 +266,8 @@ static int __init hugetlb_init(void) INIT_LIST_HEAD(&hugepage_freelists[i]); for (i = 0; i < max_huge_pages; ++i) { - page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) break; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } max_huge_pages = free_huge_pages = nr_huge_pages = i; printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); @@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); - set_page_count(&page[i], 0); } - set_page_count(page, 1); + page[1].lru.next = NULL; + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) static unsigned long set_max_huge_pages(unsigned long count) { while (count > nr_huge_pages) { - struct page *page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) return nr_huge_pages; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } if (count >= nr_huge_pages) return nr_huge_pages; @@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { @@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { struct page *old_page, *new_page; - int i, avoidcopy; + int avoidcopy; old_page = pte_page(pte); @@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) - copy_user_highpage(new_page + i, old_page + i, - address + i*PAGE_SIZE); + copy_huge_page(new_page, old_page, address); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -442,6 +572,7 @@ retry: ret = VM_FAULT_OOM; goto out; } + clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { int err; @@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) return VM_FAULT_OOM; + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; - if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep, write_access); + if (pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + mutex_unlock(&hugetlb_instantiation_mutex); + return ret; + } ret = VM_FAULT_MINOR; @@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); + mutex_unlock(&hugetlb_instantiation_mutex); return ret; } @@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) { - unsigned long vpfn, vaddr = *position; + unsigned long pfn_offset; + unsigned long vaddr = *position; int remainder = *length; - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - get_page(page); - pages[i] = page; - } + pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + page = pte_page(*pte); +same_page: + get_page(page); + if (pages) + pages[i] = page + pfn_offset; if (vmas) vmas[i] = vma; vaddr += PAGE_SIZE; - ++vpfn; + ++pfn_offset; --remainder; ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. 
+ */ + goto same_page; + } } spin_unlock(&mm->page_table_lock); *length = remainder; @@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/internal.h b/mm/internal.h index 17256bb2f4e..d20e3cc4aef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,23 +8,33 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H -static inline void set_page_refs(struct page *page, int order) +#include <linux/mm.h> + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) { -#ifdef CONFIG_MMU + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); -#else - int i; +} - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); } extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); + +#endif diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db2..80c3fb370f9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); @@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } - /* - * Add some anal sanity checks for now. 
Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - * - * Remove this test eventually! - */ +#ifdef CONFIG_DEBUG_VM if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } +#endif /* * NOTE! We still have PageReserved() pages in the page @@ -1221,9 +1214,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a39f0..e93cc740c22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -95,11 +96,8 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. 
- * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80480d..f71893ed354 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free); */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 00000000000..09f6e4aa87f --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. 
The main authors of the migration code are: + * + * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> + * Hirokazu Takahashi <taka@valinux.co.jp> + * Dave Hansen <haveblue@us.ibm.com> + * Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> /* for try_to_release_page(), + buffer_heads_over_limit */ +#include <linux/mm_inline.h> +#include <linux/pagevec.h> +#include <linux/rmap.h> +#include <linux/topology.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/swapops.h> + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. 
+ */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. 
+ */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. 
+ */
+ rc = -EAGAIN;
+ if (pass > 2)
+ lock_page(page);
+ else
+ if (TestSetPageLocked(page))
+ goto next;
+
+ /*
+ * Only wait on writeback if we have already done a pass where
+ * we may have triggered writeouts for lots of pages.
+ */
+ if (pass > 0) {
+ wait_on_page_writeback(page);
+ } else {
+ if (PageWriteback(page))
+ goto unlock_page;
+ }
+
+ /*
+ * Anonymous pages must have swap cache references otherwise
+ * the information contained in the page maps cannot be
+ * preserved.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_KERNEL)) {
+ rc = -ENOMEM;
+ goto unlock_page;
+ }
+ }
+
+ if (!to) {
+ rc = swap_page(page);
+ goto next;
+ }
+
+ newpage = lru_to_page(to);
+ lock_page(newpage);
+
+ /*
+ * Pages are properly locked and writeback is complete.
+ * Try to migrate the page.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ goto unlock_both;
+
+ if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
+ */
+ if (PageDirty(page)) {
+ switch (pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_both;
+
+ case PAGE_SUCCESS:
+ unlock_page(newpage);
+ goto next;
+
+ case PAGE_CLEAN:
+ ; /* try to migrate the page below */
+ }
+ }
+
+ /*
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (!page_has_buffers(page) ||
+ try_to_release_page(page, GFP_KERNEL)) {
+ rc = migrate_page(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * On early passes with mapped pages simply
+ * retry. There may be a lock held for some
+ * buffers that may go away. Later
+ * swap them out.
+ */
+ if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
+ unlock_page(newpage);
+ newpage = NULL;
+ rc = swap_page(page);
+ goto next;
+ }
+
+unlock_both:
+ unlock_page(newpage);
+
+unlock_page:
+ unlock_page(page);
+
+next:
+ if (rc == -EAGAIN) {
+ retry++;
+ } else if (rc) {
+ /* Permanent failure */
+ list_move(&page->lru, failed);
+ nr_failed++;
+ } else {
+ if (newpage) {
+ /* Successful migration. Return page to LRU */
+ move_to_lru(newpage);
+ }
+ list_move(&page->lru, moved);
+ }
+ }
+ if (retry && pass++ < 10)
+ goto redo;
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
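The two-list, ten-pass behaviour documented above is easier to see in isolation. Below is a minimal user-space sketch of the same retry bookkeeping, assuming an invented item type and per-pass outcome table (none of these names exist in the kernel); it illustrates only the control flow, not the migration work itself.

#include <stdio.h>

#define MAX_PASSES 10

/* Invented item: per-pass outcome, 0 = migrated, 1 = retry, 2 = permanent failure. */
struct item {
        int outcome_on_pass[MAX_PASSES];
};

/* Toy stand-in for a single migration attempt. */
static int try_migrate(const struct item *it, int pass)
{
        return it->outcome_on_pass[pass];
}

int main(void)
{
        struct item items[3] = {
                { .outcome_on_pass = { 1, 1, 0 } },     /* succeeds on the third pass */
                { .outcome_on_pass = { 0 } },           /* succeeds immediately */
                { .outcome_on_pass = { 1, 2 } },        /* fails permanently on pass 1 */
        };
        int done[3] = { 0 };
        int pass = 0, retry, nr_failed = 0;

        do {
                retry = 0;
                for (int i = 0; i < 3; i++) {
                        if (done[i])
                                continue;
                        switch (try_migrate(&items[i], pass)) {
                        case 0:                         /* moved */
                                done[i] = 1;
                                break;
                        case 1:                         /* try again on the next pass */
                                retry++;
                                break;
                        default:                        /* permanent failure */
                                done[i] = 1;
                                nr_failed++;
                        }
                }
        } while (retry && ++pass < MAX_PASSES);

        /* Mirrors the "nr_failed + retry" result of migrate_pages(). */
        printf("nr_failed=%d retry=%d\n", nr_failed, retry);
        return 0;
}

As in migrate_pages(), the value reported at the end is the count of permanent failures plus whatever was still awaiting retry when the passes ran out.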
+ */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. + * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} diff --git a/mm/mmap.c b/mm/mmap.c index 47556d2b3e9..0eb9894db6d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); -#ifdef CONFIG_HUGETLB - if (flags & VM_HUGETLB) { - if (!(flags & VM_DONTCOPY)) - mm->shared_vm += pages; - return; - } -#endif /* CONFIG_HUGETLB */ - if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1e..4c14d4289b6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f2..db45efac17c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) /* * kmalloc doesn't like __GFP_HIGHMEM for some reason */ - return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } struct page * vmalloc_to_page(void *addr) @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL); + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); if (!base) goto enomem; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd4895d1..b7f14a4799a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; -static void fastcall free_hot_cold_page(struct page *page, int cold); static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order) for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageCompound(p) | (page_private(p) != (unsigned long)page))) bad_page(page); - ClearPageCompound(p); + __ClearPageCompound(p); } } +static inline void prep_zero_page(struct page *page, 
int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * function for dealing with page's order in buddy system. * zone->lock is already acquired when we use these. @@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) mutex_debug_check_no_locks_freed(page_address(page), PAGE_SIZE<<order); -#ifndef CONFIG_MMU - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); -#endif - for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); if (reserved) @@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - - free_hot_cold_page(page, 0); + set_page_refcounted(page); + __free_page(page); } else { - LIST_HEAD(list); int loop; + prefetchw(page); for (loop = 0; loop < BITS_PER_LONG; loop++) { struct page *p = &page[loop]; - if (loop + 16 < BITS_PER_LONG) - prefetchw(p + 16); + if (loop + 1 < BITS_PER_LONG) + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } - arch_free_page(page, order); - - mod_page_state(pgfree, 1 << order); - - list_add(&page->lru, &list); - kernel_map_pages(page, 1 << order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + set_page_refcounted(page); + __free_pages(page, order); } } @@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + return 0; } @@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * Called from the slab reaper to drain pagesets on a particular node that * belong to the currently executing processor. + * Note that this function must be called with the thread pinned to + * a single processor. 
*/ void drain_node_pages(int nodeid) { int i, z; unsigned long flags; - local_irq_save(flags); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; @@ -609,11 +620,14 @@ void drain_node_pages(int nodeid) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count) { + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } } } - local_irq_restore(flags); } #endif @@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<<order) sub-pages: page[0..n] + * Each sub-page must be freed individually. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +void split_page(struct page *page, unsigned int order) { int i; - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); - for(i = 0; i < (1 << order); i++) - clear_highpage(page + i); + BUG_ON(PageCompound(page)); + BUG_ON(!page_count(page)); + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -795,14 +818,8 @@ again: put_cpu(); BUG_ON(bad_range(zone, page)); - if (prep_new_page(page, order)) + if (prep_new_page(page, order, gfp_flags)) goto again; - - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); - - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); return page; failed: @@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { - int cpu = 0; + unsigned cpu; memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - if (!cpu_isset(cpu, *cpumask)) - continue; + for_each_cpu_mask(cpu, *cpumask) { + unsigned long *in; + unsigned long *out; + unsigned off; + unsigned next_cpu; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (likely(cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, cpu)); + next_cpu = next_cpu(cpu, *cpumask); + if (likely(next_cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, next_cpu)); out = (unsigned long *)ret; for (off = 0; off < nr; off++) @@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaaa629..301b36c4a0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } +static inline void reset_ahead_window(struct file_ra_state *ra) +{ + /* + * ... but preserve ahead_start + ahead_size value, + * see 'recheck:' label in page_cache_readahead(). + * Note: We never use ->ahead_size as rvalue without + * checking ->ahead_start != 0 first. 
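The invariant described above is the whole point of reset_ahead_window(): the window is closed, but ahead_start + ahead_size still names the old window end. A minimal stand-alone sketch, with an invented ra_state struct standing in for file_ra_state, shows that the end of the window survives the reset and can later be used to clamp prev_page:

#include <stdio.h>

/* Invented stand-in for the two ahead-window fields discussed above. */
struct ra_state {
        unsigned long ahead_start;
        unsigned long ahead_size;
};

/*
 * Close the ahead window but keep ahead_start + ahead_size intact, so a
 * later clamp of prev_page against the old window end still works.
 */
static void reset_ahead_window(struct ra_state *ra)
{
        ra->ahead_size += ra->ahead_start;
        ra->ahead_start = 0;
}

int main(void)
{
        struct ra_state ra = { .ahead_start = 128, .ahead_size = 64 };
        unsigned long old_end = ra.ahead_start + ra.ahead_size;

        reset_ahead_window(&ra);

        /* The window end survives the reset: both values print 192. */
        printf("end before=%lu end after=%lu\n",
               old_end, ra.ahead_start + ra.ahead_size);
        return 0;
}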
+ */ + ra->ahead_size += ra->ahead_start; + ra->ahead_start = 0; +} + static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; ra->flags = 0; ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); return; } @@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 64) - newsize = newsize * newsize; + if (newsize <= max / 32) + newsize = newsize * 4; else if (newsize <= max / 4) - newsize = max / 4; + newsize = newsize * 2; else newsize = max; return newsize; @@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * congestion. The ahead window will any way be closed * in case we failed due to excessive page cache hits. */ - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); } return ret; @@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * If we get here we are doing sequential IO and this was not the first * occurence (ie we have an existing window) */ - if (ra->ahead_start == 0) { /* no ahead window yet */ if (!make_ahead_window(mapping, filp, ra, 0)) - goto out; + goto recheck; } + /* * Already have an ahead window, check if we crossed into it. * If so, shift windows and issue a new ahead window. @@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); +recheck: + /* prev_page shouldn't overrun the ahead window */ + ra->prev_page = min(ra->prev_page, + ra->ahead_start + ra->ahead_size - 1); } out: diff --git a/mm/rmap.c b/mm/rmap.c index 67f0e20b101..1963e269314 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,13 +56,11 @@ #include <asm/tlbflush.h> -//#define RMAP_DEBUG /* can be enabled only for debugging */ - -kmem_cache_t *anon_vma_cachep; +struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { -#ifdef RMAP_DEBUG +#ifdef CONFIG_DEBUG_VM struct anon_vma *anon_vma = find_vma->anon_vma; struct vm_area_struct *vma; unsigned int mapcount = 0; @@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void anon_vma_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { @@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (page_mapcount(page) < 0) { +#ifdef CONFIG_DEBUG_VM + if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); } - +#endif BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fbaff7..37eaf42ed2c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -875,7 +875,7 @@ redirty: } #ifdef CONFIG_NUMA -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); int err = 1; @@ -2119,7 +2119,7 @@ failed: return err; } -static kmem_cache_t *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +static void init_once(void *foo, struct kmem_cache *cachep, + unsigned long flags) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/slab.c b/mm/slab.c index d0bd7f07ab0..1c8f5ee230d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -50,7 +50,7 @@ * The head array is strictly LIFO and should improve the cache hit rates. * On SMP, it additionally reduces the spinlock operations. * - * The c_cpuarray may not be read with enabled local interrupts - + * The c_cpuarray may not be read with enabled local interrupts - * it's changed with a smp_call_function(). * * SMP synchronization: @@ -170,12 +170,12 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) @@ -266,16 +266,17 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - * [0] is for gcc 2.95. It should really be []. - */ + void *entry[0]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + * [0] is for gcc 2.95. It should really be []. + */ }; -/* bootstrap: The caches do not work without cpuarrays anymore, - * but the cpuarrays are allocated from the generic caches... +/* + * bootstrap: The caches do not work without cpuarrays anymore, but the + * cpuarrays are allocated from the generic caches... 
*/ #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { @@ -291,13 +292,13 @@ struct kmem_list3 { struct list_head slabs_full; struct list_head slabs_free; unsigned long free_objects; - unsigned long next_reap; - int free_touched; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ }; /* @@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_L3 (1 + MAX_NUMNODES) /* - * This function must be completely optimized away if - * a constant is passed to it. Mostly the same as - * what is in linux/slab.h except it returns an - * index. + * This function must be completely optimized away if a constant is passed to + * it. Mostly the same as what is in linux/slab.h except it returns an index. */ static __always_inline int index_of(const size_t size) { @@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) parent->free_touched = 0; } -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ } while (0) -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ @@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; + unsigned int buffer_size; -/* 2) touched by every alloc & free from the backend */ +/* 3) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; -/* 3) cache_grow/shrink */ + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; /* force GFP flags, e.g. 
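The structure above is essentially a small per-CPU LIFO stack of object pointers that is consulted before falling back to the slower shared lists. A rough user-space sketch, with invented toy_* names and malloc()/free() standing in for the real slab backend, might look like this:

#include <stdio.h>
#include <stdlib.h>

#define AC_LIMIT 4

/* Invented stand-in for the per-CPU LIFO array cache. */
struct toy_array_cache {
        unsigned int avail;
        unsigned int limit;
        void *entry[AC_LIMIT];
};

static void *toy_alloc(struct toy_array_cache *ac)
{
        if (ac->avail)                          /* LIFO hit: hand out the hottest object */
                return ac->entry[--ac->avail];
        return malloc(32);                      /* miss: fall back to the slow backend */
}

static void toy_free(struct toy_array_cache *ac, void *obj)
{
        if (ac->avail < ac->limit) {            /* keep it around for the next alloc */
                ac->entry[ac->avail++] = obj;
                return;
        }
        free(obj);                              /* array full: give it back for real */
}

int main(void)
{
        struct toy_array_cache ac = { .avail = 0, .limit = AC_LIMIT };
        void *a = toy_alloc(&ac);

        toy_free(&ac, a);
        void *b = toy_alloc(&ac);               /* served from the array, not malloc() */
        printf("reused=%d\n", a == b);
        free(b);
        return 0;
}

Freeing pushes the object back on the stack, so the very next allocation gets the cache-hot object, which is the point of the LIFO ordering.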
GFP_DMA */ gfp_t gfpflags; - size_t colour; /* cache colouring range */ + size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ struct kmem_cache *slabp_cache; unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + unsigned int dflags; /* dynamic flags */ /* constructor func */ void (*ctor) (void *, struct kmem_cache *, unsigned long); @@ -402,11 +403,11 @@ struct kmem_cache { /* de-constructor func */ void (*dtor) (void *, struct kmem_cache *, unsigned long); -/* 4) cache creation/removal */ +/* 5) cache creation/removal */ const char *name; struct list_head next; -/* 5) statistics */ +/* 6) statistics */ #if STATS unsigned long num_active; unsigned long num_allocations; @@ -438,8 +439,9 @@ struct kmem_cache { #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 -/* Optimization question: fewer reaps means less - * probability for unnessary cpucache drain/refill cycles. +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. @@ -453,17 +455,19 @@ struct kmem_cache { #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) #define STATS_INC_REAPED(x) ((x)->reaped++) -#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_SET_FREEABLE(x, i) \ - do { if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) - +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) @@ -478,9 +482,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) \ - do { } while (0) - +#define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) #define STATS_INC_FREEHIT(x) do { } while (0) @@ -488,7 +490,8 @@ struct kmem_cache { #endif #if DEBUG -/* Magic nums for obj red zoning. +/* + * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ @@ -499,7 +502,8 @@ struct kmem_cache { #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/* memory layout of objects: +/* + * memory layout of objects: * 0 : objp * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that * the end of an object is aligned with the end of the real @@ -508,7 +512,8 @@ struct kmem_cache { * redzone word. * cachep->obj_offset: The real object. 
* cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) { @@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif /* - * Maximum size of an obj (in 2^order pages) - * and absolute limit for the gfp order. + * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp + * order. */ #if defined(CONFIG_LARGE_ALLOCS) #define MAX_OBJ_ORDER 13 /* up to 32Mb */ @@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define BREAK_GFP_ORDER_LO 0 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; -/* Functions for storing/retrieving the cachep and or slab from the - * global 'mem_map'. These are used to find the slab an obj belongs to. - * With kfree(), these are used to find the cache which an obj belongs to. +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. */ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) { @@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct kmem_cache *)page->lru.next; } @@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct slab *)page->lru.prev; } @@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj) return page_get_slab(page); } -/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +static inline unsigned int obj_to_index(struct kmem_cache *cache, + struct slab *slab, void *obj) +{ + return (unsigned)(obj - slab->s_mem) / cache->buffer_size; +} + +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. 
+ */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include <linux/kmalloc_sizes.h> @@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = { .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, .buffer_size = sizeof(struct kmem_cache), - .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* - * vm_enough_memory() looks at this to determine how many - * slab-allocated pages are possibly freeable under pressure + * vm_enough_memory() looks at this to determine how many slab-allocated pages + * are possibly freeable under pressure * * SLAB_RECLAIM_ACCOUNT turns this on per-slab */ @@ -675,7 +697,8 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); static int __node_shrink(struct kmem_cache *cachep, int node); @@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, + gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); } -/* Calculate the number of objects and left-over bytes for a given - buffer size. */ +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) @@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -804,7 +830,7 @@ static void init_reap_node(int cpu) node = next_node(cpu_to_node(cpu), node_online_map); if (node == MAX_NUMNODES) - node = 0; + node = first_node(node_online_map); __get_cpu_var(reap_node) = node; } @@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) if (!ac_ptr) return; - for_each_node(i) kfree(ac_ptr[i]); - kfree(ac_ptr); } @@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) } } -static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) +static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) { int i = 0; struct array_cache *ac; @@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: mutex_lock(&cache_chain_mutex); - /* we need to do this right in the beginning since + /* + * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. 
* kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ list_for_each_entry(cachep, &cache_chain, next) { - /* setup the size64 kmemlist for cpu before we can + /* + * Set up the size64 kmemlist for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ if (!cachep->nodelists[node]) { - if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } - /* Now we can go ahead with allocating the shared array's - & array cache's */ + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!alien) goto bad; cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } #endif spin_unlock_irq(&l3->list_lock); - kfree(shared); free_alien_cache(alien); } @@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1150,7 +1176,7 @@ free_array_cache: #endif } return NOTIFY_OK; - bad: +bad: mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no local_irq_enable(); } -/* Initialisation. - * Called after the gfp() functions have been enabled, and before smp_init(). +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). */ void __init kmem_cache_init(void) { @@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct kmem_cache - * structures of all caches, except cache_cache itself: cache_cache - * is statically allocated. + * 1) initialize the cache_cache cache: it contains the struct + * kmem_cache structures of all caches, except cache_cache itself: + * cache_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. 
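The bootstrap described above boils down to: let the very first cache point its head array at static __init storage so it is usable immediately, then swap in properly allocated storage once the allocator itself works. A very rough user-space sketch of that idea (all names invented, error handling minimal):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRIES 4

/* Toy cache whose head array starts out pointing at static storage. */
struct toy_cache {
        void **array;                           /* current head array */
        int entries;
};

static void *init_array[ENTRIES];               /* static bootstrap array */

int main(void)
{
        struct toy_cache cache = { .array = init_array, .entries = ENTRIES };

        /* The cache is usable while it still points at static storage. */
        cache.array[0] = &cache;

        /* Once allocation works, swap in a properly allocated array. */
        void **dyn = malloc(sizeof(void *) * ENTRIES);
        if (!dyn)
                return 1;
        memcpy(dyn, cache.array, sizeof(void *) * ENTRIES);
        cache.array = dyn;

        printf("bootstrapped=%d\n", cache.array[0] == (void *)&cache);
        free(dyn);
        return 0;
}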
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_line_size()); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void) sizes = malloc_sizes; names = cache_names; - /* Initialize the caches that provide memory for the array cache - * and the kmem_list3 structures first. - * Without this, further allocations will bug + /* + * Initialize the caches that provide memory for the array cache and the + * kmem_list3 structures first. Without this, further allocations will + * bug. */ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); - if (INDEX_AC != INDEX_L3) + if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, - NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } while (sizes->cs_size != ULONG_MAX) { /* @@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ - if (!sizes->cs_cachep) + if (!sizes->cs_cachep) { sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS - | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { @@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void) } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_CACHE_DMA | - SLAB_PANIC), NULL, - NULL); - + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + SLAB_PANIC, + NULL, NULL); sizes++; names++; } @@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void) struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); mutex_unlock(&cache_chain_mutex); } /* Done! */ g_cpucache_up = FULL; - /* Register a cpu startup notifier callback - * that initializes cpu_cache_get for all new cpus + /* + * Register a cpu startup notifier callback that initializes + * cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); - /* The reap timers are started later, with a module init call: - * That part of the kernel is not yet operational. + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. */ } @@ -1366,16 +1397,13 @@ static int __init cpucache_init(void) { int cpu; - /* - * Register the timers that return unneeded - * pages to gfp. 
+ /* + * Register the timers that return unneeded pages to the page allocator */ for_each_online_cpu(cpu) - start_cpu_timer(cpu); - + start_cpu_timer(cpu); return 0; } - __initcall(cpucache_init); /* @@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) atomic_add(i, &slab_reclaim_pages); add_page_state(nr_slab, i); while (i--) { - SetPageSlab(page); + __SetPageSlab(page); page++; } return addr; @@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) const unsigned long nr_freed = i; while (i--) { - if (!TestClearPageSlab(page)) - BUG(); + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); page++; } sub_page_state(nr_slab, nr_freed); @@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) { + for (i = 0; i < limit; i++) printk(" %02x", (unsigned char)data[offset + i]); - } printk("\n"); } #endif @@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) if (cachep->flags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } realobj = (char *)objp + obj_offset(cachep); @@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: start=%p, len=%d\n", + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) * exist: */ struct slab *slabp = virt_to_slab(objp); - int objnr; + unsigned int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #if DEBUG /** - * slab_destroy_objs - call the registered destructor for each object in - * a slab that is to be destroyed. + * slab_destroy_objs - destroy a slab and its objects + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * + * Call the registered destructor for each object in a slab that is being + * destroyed. 
*/ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 - && OFF_SLAB(cachep)) + if (cachep->buffer_size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, - 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); (cachep->dtor) (objp, cachep, 0); } } @@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) #endif /** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { @@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose buffer_size is same - as size of kmem_list3. */ +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, +static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { size_t left_over = 0; int gfporder; - for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, /* * Acceptable internal fragmentation? */ - if ((left_over * 8) <= (PAGE_SIZE << gfporder)) + if (left_over * 8 <= (PAGE_SIZE << gfporder)) break; } return left_over; } +static void setup_cpu_cache(struct kmem_cache *cachep) +{ + if (g_cpucache_up == FULL) { + enable_cpucache(cachep); + return; + } + if (g_cpucache_up == NONE) { + /* + * Note: the first kmem_cache_create must create the cache + * that's used by kmalloc(24), otherwise the creation of + * further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + + /* + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is + * the first cache, then we need to set up all its list3s, + * otherwise the creation of further caches will BUG(). 
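The acceptance test in calculate_slab_order() shown just above, left_over * 8 <= (PAGE_SIZE << gfporder), is easy to check numerically. The following simplified sketch ignores the slab-management overhead and uses an invented toy_slab_order() helper:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * For a given object size, find the smallest order whose leftover space is
 * at most 1/8 of the slab, mirroring the acceptance test above.
 */
static int toy_slab_order(unsigned long obj_size)
{
        for (int order = 0; order <= 5; order++) {
                unsigned long slab_bytes = PAGE_SIZE << order;
                unsigned long num = slab_bytes / obj_size;
                unsigned long left_over = slab_bytes - num * obj_size;

                if (!num)
                        continue;               /* object does not fit yet */
                if (left_over * 8 <= slab_bytes)
                        return order;           /* fragmentation is acceptable */
        }
        return 5;
}

int main(void)
{
        /* A 1100-byte object wastes 796 bytes per page, so order 0 is rejected. */
        printf("order for 1100-byte objects: %d\n", toy_slab_order(1100));
        return 0;
}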
+ */ + set_up_list3s(cachep, SIZE_AC); + if (INDEX_AC == INDEX_L3) + g_cpucache_up = PARTIAL_L3; + else + g_cpucache_up = PARTIAL_AC; + } else { + cachep->array[smp_processor_id()] = + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + if (g_cpucache_up == PARTIAL_AC) { + set_up_list3s(cachep, SIZE_L3); + g_cpucache_up = PARTIAL_L3; + } else { + int node; + for_each_online_node(node) { + cachep->nodelists[node] = + kmalloc_node(sizeof(struct kmem_list3), + GFP_KERNEL, node); + BUG_ON(!cachep->nodelists[node]); + kmem_list3_init(cachep->nodelists[node]); + } + } + } + cachep->nodelists[numa_node_id()]->next_reap = + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * and the @dtor is run before the pages are handed back. * * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting - * unloaded. - * + * the module calling this has to destroy the cache before getting unloaded. + * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check * for buffer overruns. * - * %SLAB_NO_REAP - Don't automatically reap this cache when we're under - * memory pressure. - * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; @@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Sanity checks... these are all serious usage bugs. */ - if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || + if (!name || in_interrupt() || (size < BYTES_PER_WORD) || (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { - printk(KERN_ERR "%s: Early error in slab %s\n", - __FUNCTION__, name); + printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + name); BUG(); } @@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 - || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG_ON(dtor); /* - * Always checks flags, a caller might be expecting debug - * support which isn't available. 
+ * Always checks flags, a caller might be expecting debug support which + * isn't available. */ if (flags & ~CREATE_MASK) BUG(); - /* Check that size is in terms of words. This is needed to avoid + /* + * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ @@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(BYTES_PER_WORD - 1); } - /* calculate out the final buffer alignment: */ + /* calculate the final buffer alignment: */ + /* 1) arch recommendation: can be overridden for debug */ if (flags & SLAB_HWCACHE_ALIGN) { - /* Default alignment: as specified by the arch code. - * Except if an object is really small, then squeeze multiple - * objects into one cacheline. + /* + * Default alignment: as specified by the arch code. Except if + * an object is really small, then squeeze multiple objects into + * one cacheline. */ ralign = cache_line_size(); while (size <= ralign / 2) @@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign > BYTES_PER_WORD) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 4) Store it. Note that the debug code below can reduce + /* + * 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. */ align = ralign; @@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; - spin_lock_init(&cachep->spinlock); cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) @@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->name = name; - if (g_cpucache_up == FULL) { - enable_cpucache(cachep); - } else { - if (g_cpucache_up == NONE) { - /* Note: the first kmem_cache_create must create - * the cache that's used by kmalloc(24), otherwise - * the creation of further caches will BUG(). - */ - cachep->array[smp_processor_id()] = - &initarray_generic.cache; - - /* If the cache that's used by - * kmalloc(sizeof(kmem_list3)) is the first cache, - * then we need to set up all its list3s, otherwise - * the creation of further caches will BUG(). 
- */ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; - else - g_cpucache_up = PARTIAL_AC; - } else { - cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - - if (g_cpucache_up == PARTIAL_AC) { - set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; - } else { - int node; - for_each_online_node(node) { - - cachep->nodelists[node] = - kmalloc_node(sizeof - (struct kmem_list3), - GFP_KERNEL, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep-> - nodelists[node]); - } - } - } - cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - BUG_ON(!cpu_cache_get(cachep)); - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + setup_cpu_cache(cachep); /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); - oops: +oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); @@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -/* - * Waits for all CPUs to execute func(). - */ -static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) -{ - check_irq_on(); - preempt_disable(); - - local_irq_disable(); - func(arg); - local_irq_enable(); - - if (smp_call_function(func, arg, 1, 1)) - BUG(); - - preempt_enable(); -} - -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node); +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); static void do_drain(void *arg) { - struct kmem_cache *cachep = (struct kmem_cache *) arg; + struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_node_id(); @@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - smp_call_function_all_cpus(do_drain, cachep); + on_each_cpu(do_drain, cachep, 1, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock_irq(&l3->list_lock); - drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock_irq(&l3->list_lock); + drain_array(cachep, l3, l3->shared, 1, node); if (l3->alien) drain_alien_cache(cachep, l3->alien); } @@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) /* NUMA: free the list3 structures */ for_each_online_node(i) { - if ((l3 = cachep->nodelists[i])) { + l3 = cachep->nodelists[i]; + if (l3) { kfree(l3->shared); free_alien_cache(l3->alien); kfree(l3); } } kmem_cache_free(&cache_cache, cachep); - unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; - return slabp; } @@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } /* - * Constructors are not allowed to allocate memory from - * the same cache which they are a constructor for. - * Otherwise, deadlock. They must also be threaded. + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) cachep->ctor(objp + obj_offset(cachep), cachep, @@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) - && cachep->flags & SLAB_POISON) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); #else @@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) { - if (!(cachep->gfpflags & GFP_DMA)) - BUG(); - } else { - if (cachep->gfpflags & GFP_DMA) - BUG(); - } + if (flags & SLAB_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + int nodeid) { - void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + void *objp = index_to_obj(cachep, slabp, slabp->free); kmem_bufctl_t next; slabp->inuse++; @@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, - int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + void *objp, int nodeid) { - unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + unsigned int objnr = obj_to_index(cachep, slabp, objp); #if DEBUG /* Verify that the slab belongs to the intended node */ @@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob slabp->inuse--; } -static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, + void *objp) { int i; struct page *page; /* Nasty!!!!!! I hope this is OK. */ - i = 1 << cachep->gfporder; page = virt_to_page(objp); + + i = 1; + if (likely(!PageCompound(page))) + i <<= cachep->gfporder; do { page_set_cache(page, cachep); page_set_slab(page, slabp); @@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) unsigned long ctor_flags; struct kmem_list3 *l3; - /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). 
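slab_get_obj() and slab_put_obj() above keep a slab's free objects as an index chain threaded through the bufctl array, with slabp->free naming the head of the chain. A minimal user-space sketch of that free-list scheme, using invented toy_* names and a four-object slab:

#include <stdio.h>

#define NOBJ 4
#define TOY_BUFCTL_END 0xffffu

/* Toy per-slab free list kept as an index chain, like the bufctl array. */
struct toy_slab {
        unsigned short bufctl[NOBJ];    /* bufctl[i] = index of next free object */
        unsigned int free;              /* index of the first free object */
        unsigned int inuse;
};

static void toy_slab_init(struct toy_slab *s)
{
        for (unsigned int i = 0; i < NOBJ; i++)
                s->bufctl[i] = (unsigned short)(i + 1);
        s->bufctl[NOBJ - 1] = TOY_BUFCTL_END;
        s->free = 0;
        s->inuse = 0;
}

static int toy_get_obj(struct toy_slab *s)
{
        unsigned int idx = s->free;

        if (idx == TOY_BUFCTL_END)
                return -1;              /* slab is full */
        s->free = s->bufctl[idx];       /* unlink the head of the free chain */
        s->inuse++;
        return (int)idx;
}

static void toy_put_obj(struct toy_slab *s, unsigned int idx)
{
        s->bufctl[idx] = (unsigned short)s->free;       /* push back on the chain */
        s->free = idx;
        s->inuse--;
}

int main(void)
{
        struct toy_slab s;

        toy_slab_init(&s);
        int a = toy_get_obj(&s);        /* 0 */
        int b = toy_get_obj(&s);        /* 1 */
        toy_put_obj(&s, (unsigned int)a);
        /* The just-freed index 0 is handed out again before index 2. */
        printf("a=%d b=%d next=%d inuse=%u\n", a, b, toy_get_obj(&s), s.inuse);
        return 0;
}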
*/ if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); @@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ kmem_flagcheck(cachep, flags); - /* Get mem for the objs. - * Attempt to allocate a physical page from 'nodeid', + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. */ - if (!(objp = kmem_getpages(cachep, flags, nodeid))) + objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) goto failed; /* Get slab management. */ - if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + if (!slabp) goto opps1; slabp->nodeid = nodeid; @@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; - opps1: +opps1: kmem_freepages(cachep, objp); - failed: +failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR - "mismatch in kmem_cache_free: expected cache %p, got %p\n", + printk(KERN_ERR "mismatch in kmem_cache_free: expected " + "cache %p, got %p\n", page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); printk(KERN_ERR "%p is %s.\n", page_get_cache(page), @@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE - || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || + *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1:0x%lx, " + "redzone 2:0x%lx.\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* Need to call the slab's constructor so the - * caller can perform a verify of its state (debugging). - * Called without the cache-lock held. + /* + * Need to call the slab's constructor so the caller can + * perform a verify of its state (debugging). Called without + * the cache-lock held. 
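 *
 * For context on the red-zone checks reformatted above (and the matching
 * ones in cache_alloc_debugcheck_after() further down): with SLAB_RED_ZONE
 * the two guard words around an object behave as a small state machine,
 *
 *	object free:      redzone1 == redzone2 == RED_INACTIVE
 *	object allocated: redzone1 == redzone2 == RED_ACTIVE
 *
 * cache_init_objs() stores RED_INACTIVE, allocation expects RED_INACTIVE
 * and flips both words to RED_ACTIVE, and freeing expects RED_ACTIVE (the
 * lines not shown in this hunk put RED_INACTIVE back).  Any other value at
 * free time therefore means either a second free of the same object or a
 * neighbouring buffer overrun clobbering the guard word, which is what the
 * "double free, or memory outside object was overwritten" message reports.
 *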
*/ cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); @@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); @@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { - bad: - printk(KERN_ERR - "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); for (i = 0; i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); i++) { - if ((i % 16) == 0) + if (i % 16 == 0) printk("\n%03x:", i); printk(" %02x", ((unsigned char *)slabp)[i]); } @@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); ac = cpu_cache_get(cachep); - retry: +retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* if there was little recent activity on this - * cache, then perform only a partial refill. - * Otherwise we could generate refill bouncing. + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. */ batchcount = BATCHREFILL_LIMIT; } @@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) list_add(&slabp->list, &l3->slabs_partial); } - must_grow: +must_grow: l3->free_objects -= ac->avail; - alloc_done: +alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { int x; x = cache_grow(cachep, flags, numa_node_id()); - // cache_grow can reenable interrupts, then ac could change. + /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) // no objects in sight? abort + if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) /* objects refilled by interrupt? 
*/ goto retry; } ac->touched = 1; return ac->entry[--ac->avail]; } -static inline void -cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, - void *objp, void *caller) +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) { if (!objp) return objp; @@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE - || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +static __always_inline void *__cache_alloc(struct kmem_cache *cachep, + gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node l3 = cachep->nodelists[nodeid]; BUG_ON(!l3); - retry: +retry: check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; @@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) { + if (slabp->free == BUFCTL_END) list_add(&slabp->list, &l3->slabs_full); - } else { + else list_add(&slabp->list, &l3->slabs_partial); - } spin_unlock(&l3->list_lock); goto done; - must_grow: +must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); @@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node return NULL; goto retry; - done: +done: return obj; } #endif @@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } free_block(cachep, ac->entry, batchcount, node); - free_done: +free_done: #if STATS { int i = 0; @@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif spin_unlock(&l3->list_lock); ac->avail -= batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), - 
sizeof(void *) * ac->avail); + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } /* - * __cache_free - * Release an obj back to its cache. If the obj has a constructed - * state, it must be in this state _before_ it is released. - * - * Called with disabled ints. + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp) { @@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = - cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3; + l3 = cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; - out: +out: return 0; } @@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) local_irq_save(save_flags); if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); @@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node); * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. + * @caller: function caller for debug tracking of the caller * * kmalloc is the normal method of allocating memory * in the kernel. @@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size) /* Catch derefs w/o wrappers */ return (void *)(~(unsigned long)pdata); - unwind_oom: +unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep) struct array_cache *nc = NULL, *new; struct array_cache **new_alien = NULL; #ifdef CONFIG_NUMA - if (!(new_alien = alloc_alien_cache(node, cachep->limit))) + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) goto fail; #endif - if (!(new = alloc_arraycache(node, (cachep->shared * - cachep->batchcount), - 0xbaadf00d))) + new = alloc_arraycache(node, cachep->shared*cachep->batchcount, + 0xbaadf00d); + if (!new) goto fail; - if ((l3 = cachep->nodelists[node])) { - + l3 = cachep->nodelists[node]; + if (l3) { spin_lock_irq(&l3->list_lock); - if ((nc = cachep->nodelists[node]->shared)) + nc = cachep->nodelists[node]->shared; + if (nc) free_block(cachep, nc->entry, nc->avail, node); l3->shared = new; @@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep) new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); kfree(nc); free_alien_cache(new_alien); continue; } - if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + if (!l3) goto fail; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; l3->shared = new; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; 
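		/*
		 * Worked example of the free_limit formula above, with
		 * illustrative numbers only: on a node with two CPUs, a cache
		 * with batchcount = 16 and num = 30 objects per slab gets
		 *
		 *	free_limit = (1 + 2) * 16 + 30 = 78
		 *
		 * i.e. roughly 78 free objects may accumulate on this node's
		 * lists before free_block() starts returning whole empty
		 * slabs to the page allocator.
		 */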
cachep->nodelists[node] = l3; } return err; - fail: +fail: err = -ENOMEM; return err; } @@ -3391,7 +3409,7 @@ struct ccupdate_struct { static void do_ccupdate_local(void *info) { - struct ccupdate_struct *new = (struct ccupdate_struct *)info; + struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); @@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, - int shared) +/* Always called with the cache_chain_mutex held */ +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) { struct ccupdate_struct new; int i, err; memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new.new[i] = - alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, + batchcount); if (!new.new[i]) { for (i--; i >= 0; i--) kfree(new.new[i]); @@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount } new.cachep = cachep; - smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); check_irq_on(); - spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount return 0; } +/* Called with cache_chain_mutex held always */ static void enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; - /* The head array serves three purposes: + /* + * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and + * - reduce the number of linked list operations on the slab and * bufctl chains: array operations are cheaper. * The numbers are guessed, we should auto-tune as described by * Bonwick. @@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep) else limit = 120; - /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound * allocation behaviour: Most allocs on one cpu, most free operations * on another cpu. For these cases, an efficient object passing between * cpus is necessary. This is provided by a shared array. The array @@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep) #endif #if DEBUG - /* With debugging enabled, large batchcount lead to excessively - * long periods with disabled local interrupts. Limit the - * batchcount + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount */ if (limit > 32) limit = 32; @@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep) cachep->name, -err); } -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node) +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. 
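 *
 * The amount drained per call when "force" is not set (see the body just
 * below) is about a fifth of the array's limit, but never more than half
 * of what is currently cached.  For example, with the limit of 120 used
 * for the smallest objects:
 *
 *	tofree = (120 + 4) / 5 = 24	if at least 24 objects are cached
 *	tofree = (10 + 1) / 2  = 5	if only 10 objects are cached
 *
 * so an idle array is trimmed gradually over several reap intervals rather
 * than emptied in one go.
 *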
+ */ +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { int tofree; - check_spinlock_acquired_node(cachep, node); + if (!ac || !ac->avail) + return; if (ac->touched && !force) { ac->touched = 0; - } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) { - tofree = (ac->avail + 1) / 2; + } else { + spin_lock_irq(&l3->list_lock); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); } - free_block(cachep, ac->entry, tofree, node); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); + spin_unlock_irq(&l3->list_lock); } } @@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain mutex then just give up - we'll - * try again on the next iteration. + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. */ static void cache_reap(void *unused) { struct list_head *walk; struct kmem_list3 *l3; + int node = numa_node_id(); if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ @@ -3550,65 +3580,72 @@ static void cache_reap(void *unused) struct slab *slabp; searchp = list_entry(walk, struct kmem_cache, next); - - if (searchp->flags & SLAB_NO_REAP) - goto next; - check_irq_on(); - l3 = searchp->nodelists[numa_node_id()]; + /* + * We only take the l3 lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + l3 = searchp->nodelists[node]; + reap_alien(searchp, l3); - spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, cpu_cache_get(searchp), 0, - numa_node_id()); + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ if (time_after(l3->next_reap, jiffies)) - goto next_unlock; + goto next; l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - if (l3->shared) - drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) { l3->free_touched = 0; - goto next_unlock; + goto next; } - tofree = - (l3->free_limit + 5 * searchp->num - - 1) / (5 * searchp->num); + tofree = (l3->free_limit + 5 * searchp->num - 1) / + (5 * searchp->num); do { + /* + * Do not lock if there are no free blocks. + */ + if (list_empty(&l3->slabs_free)) + break; + + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) + if (p == &(l3->slabs_free)) { + spin_unlock_irq(&l3->list_lock); break; + } slabp = list_entry(p, struct slab, list); BUG_ON(slabp->inuse); list_del(&slabp->list); STATS_INC_REAPED(searchp); - /* Safe to drop the lock. The slab is no longer - * linked to the cache. - * searchp cannot disappear, we hold + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. 
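 *
 * (On the pacing of this loop: tofree above is
 *	(l3->free_limit + 5 * searchp->num - 1) / (5 * searchp->num),
 * the ceiling of free_limit / (5 * num).  With, say, free_limit = 78 and
 * num = 30 objects per slab that is one slab per pass, so even a
 * completely idle cache is shrunk back towards free_limit a little on
 * each REAPTIMEOUT_CPUC tick rather than all at once.)
 *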
searchp cannot disappear, we hold * cache_chain_lock */ l3->free_objects -= searchp->num; spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); - spin_lock_irq(&l3->list_lock); } while (--tofree > 0); - next_unlock: - spin_unlock_irq(&l3->list_lock); - next: +next: cond_resched(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - /* Setup the next iteration */ + /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } @@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) { struct kmem_cache *cachep = p; ++*pos; - return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, struct kmem_cache, next); + return cachep->next.next == &cache_chain ? + NULL : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees); } /* cpu stats */ { @@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock(&cachep->spinlock); return 0; } @@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, - next); + struct kmem_cache *cachep; + cachep = list_entry(p, struct kmem_cache, next); if (!strcmp(cachep->name, kbuf)) { - if (limit < 1 || - batchcount < 1 || - batchcount > limit || shared < 0) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, diff --git a/mm/swap.c b/mm/swap.c index b524ea90bdd..91b7e2026f6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,19 +209,18 @@ int lru_add_drain_all(void) */ void fastcall __page_cache_release(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); - spin_lock_irqsave(&zone->lru_lock, flags); - if (TestClearPageLRU(page)) + spin_lock_irqsave(&zone->lru_lock, flags); + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) != 0) - page = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - if (page) - free_hot_page(page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); } - EXPORT_SYMBOL(__page_cache_release); /* @@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; - struct zone *pagezone; if (unlikely(PageCompound(page))) { if (zone) { @@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) if (!put_page_testzero(page)) continue; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if 
(TestClearPageLRU(page)) + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) == 0) { - if (!pagevec_add(&pages_to_free, page)) { + } + + if (!pagevec_add(&pages_to_free, page)) { + if (zone) { spin_unlock_irq(&zone->lru_lock); - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - zone = NULL; /* No lock is held */ + zone = NULL; } - } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + } } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); - if (TestSetPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(PageActive(page)); + SetPageActive(page); add_page_to_active_list(zone, page); } if (zone) diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e163..d7af296833f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/migrate.h> #include <asm/pgtable.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d073b..365ed6ff182 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER-1; + si->cluster_next = offset-SWAPFILE_CLUSTER+1; goto cluster; } if (unlikely(--latency_ration < 0)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3aa02e..fd572bbdc9f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,39 +33,21 @@ #include <linux/cpuset.h> #include <linux/notifier.h> #include <linux/rwsem.h> +#include <linux/delay.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; +#include "internal.h" struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - unsigned long nr_mapped; /* From page_state */ - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - /* This context's GFP mask */ gfp_t gfp_mask; @@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. 
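 *
 * (A note on the one-character mm/swapfile.c fix a few hunks up: in
 * scan_swap_map(), hitting a used slot at position u sets
 * last_in_cluster = u + SWAPFILE_CLUSTER, and reaching
 * offset == last_in_cluster with nothing but free slots in between means
 * the free run is u+1 .. offset, exactly SWAPFILE_CLUSTER slots long, so
 * its first slot is offset - SWAPFILE_CLUSTER + 1.  With the usual
 * SWAPFILE_CLUSTER of 256 and a used slot at 1000, the run 1001..1256 is
 * detected at offset 1256 and cluster_next should become 1001; the old
 * "- 1" expression yielded 999, pointing below the start of the cluster.)
 *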
*/ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -414,14 +398,15 @@ cannot_free: } /* - * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + * shrink_page_list() returns the number of reclaimed pages */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long nr_reclaimed = 0; cond_resched(); @@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page)) if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; - } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - /* - * No unmapping if we do not swap - */ - if (!sc->may_swap) - goto keep_locked; - switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; @@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) free_it: unlock_page(page); - reclaimed++; + nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -579,483 +555,8 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); - sc->nr_reclaimed += reclaimed; - return reclaimed; -} - -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. 
- */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> - * Hirokazu Takahashi <taka@valinux.co.jp> - * Dave Hansen <haveblue@us.ibm.com> - * Christoph Lameter <clameter@sgi.com> - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. 
- */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; + return nr_reclaimed; } -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. 
- */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - int retry; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. 
Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { - ret = 1; - get_page(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif /* * zone->lru_lock is heavily contended. Some of the functions that @@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. */ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan; - while (scan++ < nr_to_scan && !list_empty(src)) { + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); - if (get_page_testone(page)) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + ClearPageLRU(page); + target = dst; nr_taken++; - } + } /* else it is being freed elsewhere */ + + list_add(&page->lru, target); } *scanned = scan; @@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, } /* - * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number + * of reclaimed pages */ -static void shrink_cache(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, &zone->inactive_list, @@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - - max_scan -= nr_scan; - nr_freed = shrink_list(&page_list, sc); - + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc); + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); @@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) __mod_page_state_zone(zone, pgscan_direct, nr_scan); __mod_page_state_zone(zone, pgsteal, nr_freed); + if (nr_taken == 0) + goto done; + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - } - spin_unlock_irq(&zone->lru_lock); + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); done: + local_irq_enable(); pagevec_release(&pvec); + return nr_reclaimed; } /* @@ -1188,13 +697,12 @@ done: * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. 
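 *
 * (This refcount-before-flag ordering is what the isolate_lru_pages()
 * rewrite above enforces: the page is first pinned with
 * get_page_unless_zero() -- presumably an atomic_inc_not_zero() on
 * page->_count -- and only if that succeeds is PageLRU cleared.  The old
 * TestClearPageLRU()-then-get_page_testone() order could briefly clear the
 * LRU bit on a page whose last reference was being dropped concurrently,
 * something the reworked __page_cache_release()/release_pages() paths in
 * mm/swap.c now assume can no longer happen.)
 *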
*/ -static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; - int nr_pages = sc->nr_to_scan; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct pagevec pvec; int reclaim_mapped = 0; - if (unlikely(sc->may_swap)) { + if (sc->may_swap) { long mapped_ratio; long distress; long swap_tendency; @@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; @@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; atomic_inc(&zone->reclaim_in_progress); @@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. 
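 *
 * As a feel for the pacing below: at the initial DEF_PRIORITY of 12, a
 * zone with about 2,000,000 active pages earns (2000000 >> 12) + 1 = 489
 * pages of scan credit per call, and scanning only actually happens once
 * the accumulated credit reaches sc->swap_cluster_max (normally
 * SWAP_CLUSTER_MAX, i.e. 32 pages), in chunks of at most that size.  As
 * the priority falls towards zero the shift shrinks, so each pass covers
 * a progressively larger share of the zone.
 *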
*/ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + zone->nr_scan_active += (zone->nr_active >> priority) + 1; nr_active = zone->nr_scan_active; if (nr_active >= sc->swap_cluster_max) zone->nr_scan_active = 0; else nr_active = 0; - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; nr_inactive = zone->nr_scan_inactive; if (nr_inactive >= sc->swap_cluster_max) zone->nr_scan_inactive = 0; @@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) while (nr_active || nr_inactive) { if (nr_active) { - sc->nr_to_scan = min(nr_active, + nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); + nr_active -= nr_to_scan; + shrink_active_list(nr_to_scan, zone, sc); } if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, + nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); + nr_inactive -= nr_to_scan; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); } } throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; } /* @@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void -shrink_caches(struct zone **zones, struct scan_control *sc) +static unsigned long shrink_zones(int priority, struct zone **zones, + struct scan_control *sc) { + unsigned long nr_reclaimed = 0; int i; for (i = 0; zones[i] != NULL; i++) { @@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(zone, sc); + nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } /* @@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; unsigned long lru_pages = 0; int i; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; inc_page_state(allocstall); @@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); - shrink_caches(zones, &sc); + nr_reclaimed += shrink_zones(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; + nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } total_scanned += sc.nr_scanned; - total_reclaimed += sc.nr_reclaimed; - if (total_reclaimed >= sc.swap_cluster_max) { + if (nr_reclaimed >= sc.swap_cluster_max) { ret = 1; goto out; } @@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); sc.may_writepage = 1; } @@ -1528,22 +1042,26 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; loop_again: total_scanned = 0; - total_reclaimed = 0; - sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + nr_reclaimed = 0; + sc.may_writepage = !laptop_mode, sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1624,15 +1142,11 @@ scan: if (zone->prev_priority > priority) zone->prev_priority = priority; sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; + nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; @@ -1645,10 +1159,10 @@ scan: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) + total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > total_reclaimed) + if (nr_pages && to_free > nr_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ @@ -1665,7 +1179,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) break; } out: @@ -1679,7 +1193,7 @@ out: goto loop_again; } - return total_reclaimed; + return nr_reclaimed; } /* @@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. */ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; + unsigned retry = 2; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; +repeat: for_each_pgdat(pgdat) { - int freed; + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } current->reclaim_state = NULL; return ret; } @@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages) away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; @@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + pgdat->kswapd = find_task_by_pid(pid); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; /* * Try to free up some pages from this zone through reclaim. 
*/ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc; - cpumask_t mask; - int node_id; - - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) - return 0; - - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (p->flags & PF_MEMALLOC)) - return 0; - - node_id = zone->zone_pgdat->node_id; - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) - return 0; - - sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); - sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = ZONE_RECLAIM_PRIORITY + 1; - sc.nr_mapped = read_page_state(nr_mapped); - sc.gfp_mask = gfp_mask; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; disable_swap_token(); - - nr_pages = 1 << order; - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing priorities * until we have enough memory freed. */ + priority = ZONE_RECLAIM_PRIORITY; do { - sc.priority--; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); - } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); - - if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* - * shrink_slab does not currently allow us to determine - * how many pages were freed in the zone. So we just - * shake the slab and then go offnode for a single allocation. + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. * * shrink_slab will free memory on all zones and may take * a long time. @@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (sc.nr_reclaimed == 0) + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. 
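 *
 * Concretely, zone_reclaim_interval defaults to 30*HZ (see above), so a
 * failed attempt stamps last_unsuccessful_zone_reclaim with the current
 * jiffies and the time_before() test in zone_reclaim() then short-circuits
 * local reclaim for roughly the next 30 seconds, letting allocations spill
 * to other nodes instead of rescanning a zone that has just proven
 * unreclaimable on every off-node allocation.
 *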
+ */ zone->last_unsuccessful_zone_reclaim = jiffies; + } - return sc.nr_reclaimed >= nr_pages; + return nr_reclaimed >= nr_pages; } -#endif +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + cpumask_t mask; + int node_id; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. + */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0 || + (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + return __zone_reclaim(zone, gfp_mask, order); +} +#endif diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 74cb79eb917..f6940618e34 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -16,11 +16,12 @@ #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include "internal.h" /* session keyring create vs join semaphore */ -static DECLARE_MUTEX(key_session_sem); +static DEFINE_MUTEX(key_session_mutex); /* the root user's tracking struct */ struct key_user root_key_user = { @@ -711,7 +712,7 @@ long join_session_keyring(const char *name) } /* allow the user to join or create a named keyring */ - down(&key_session_sem); + mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, 0); @@ -737,7 +738,7 @@ long join_session_keyring(const char *name) key_put(keyring); error2: - up(&key_session_sem); + mutex_unlock(&key_session_mutex); error: return ret; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5b16196f282..ccaf988f372 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -117,6 +117,8 @@ static struct security_operations *secondary_ops = NULL; static LIST_HEAD(superblock_security_head); static DEFINE_SPINLOCK(sb_security_lock); +static kmem_cache_t *sel_inode_cache; + /* Allocate and free functions for each kind of security blob. 
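   Inode blobs are now carved from the dedicated sel_inode_cache slab
   created in selinux_init().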
*/ static int task_alloc_security(struct task_struct *task) @@ -146,10 +148,11 @@ static int inode_alloc_security(struct inode *inode) struct task_security_struct *tsec = current->security; struct inode_security_struct *isec; - isec = kzalloc(sizeof(struct inode_security_struct), GFP_KERNEL); + isec = kmem_cache_alloc(sel_inode_cache, SLAB_KERNEL); if (!isec) return -ENOMEM; + memset(isec, 0, sizeof(*isec)); init_MUTEX(&isec->sem); INIT_LIST_HEAD(&isec->list); isec->inode = inode; @@ -172,7 +175,7 @@ static void inode_free_security(struct inode *inode) spin_unlock(&sbsec->isec_lock); inode->i_security = NULL; - kfree(isec); + kmem_cache_free(sel_inode_cache, isec); } static int file_alloc_security(struct file *file) @@ -1929,7 +1932,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, struct task_security_struct *tsec; struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; - struct inode_security_struct *isec; u32 newsid, clen; int rc; char *namep = NULL, *context; @@ -1937,7 +1939,6 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, tsec = current->security; dsec = dir->i_security; sbsec = dir->i_sb->s_security; - isec = inode->i_security; if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) { newsid = tsec->create_sid; @@ -1957,7 +1958,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, inode_security_set_sid(inode, newsid); - if (sbsec->behavior == SECURITY_FS_USE_MNTPOINT) + if (!ss_initialized || sbsec->behavior == SECURITY_FS_USE_MNTPOINT) return -EOPNOTSUPP; if (name) { @@ -4408,6 +4409,9 @@ static __init int selinux_init(void) tsec = current->security; tsec->osid = tsec->sid = SECINITSID_KERNEL; + sel_inode_cache = kmem_cache_create("selinux_inode_security", + sizeof(struct inode_security_struct), + 0, SLAB_PANIC, NULL, NULL); avc_init(); original_ops = secondary_ops = security_ops; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index b5fa02d17b1..f5d78365488 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <linux/mutex.h> #include <linux/init.h> #include <linux/string.h> #include <linux/security.h> @@ -44,7 +45,7 @@ static int __init checkreqprot_setup(char *str) __setup("checkreqprot=", checkreqprot_setup); -static DECLARE_MUTEX(sel_sem); +static DEFINE_MUTEX(sel_mutex); /* global data for booleans */ static struct dentry *bool_dir = NULL; @@ -230,7 +231,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, ssize_t length; void *data = NULL; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__LOAD_POLICY); if (length) @@ -262,7 +263,7 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, else length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); vfree(data); return length; } @@ -709,12 +710,11 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, { char *page = NULL; ssize_t length; - ssize_t end; ssize_t ret; int cur_enforcing; struct inode *inode; - down(&sel_sem); + mutex_lock(&sel_mutex); ret = -EFAULT; @@ -740,26 +740,9 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing, bool_pending_values[inode->i_ino - BOOL_INO_OFFSET]); - if (length < 0) { - ret = length; - goto out; - } - - if (*ppos >= length) { - 
ret = 0; - goto out; - } - if (count + *ppos > length) - count = length - *ppos; - end = count + *ppos; - if (copy_to_user(buf, (char *) page + *ppos, count)) { - ret = -EFAULT; - goto out; - } - *ppos = end; - ret = count; + ret = simple_read_from_buffer(buf, count, ppos, page, length); out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long)page); return ret; @@ -773,7 +756,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, int new_value; struct inode *inode; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__SETBOOL); if (length) @@ -812,7 +795,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf, length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long) page); return length; @@ -831,7 +814,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, ssize_t length = -EFAULT; int new_value; - down(&sel_sem); + mutex_lock(&sel_mutex); length = task_has_security(current, SECURITY__SETBOOL); if (length) @@ -869,7 +852,7 @@ static ssize_t sel_commit_bools_write(struct file *filep, length = count; out: - up(&sel_sem); + mutex_unlock(&sel_mutex); if (page) free_page((unsigned long) page); return length; @@ -987,7 +970,7 @@ out: return ret; err: kfree(values); - d_genocide(dir); + sel_remove_bools(dir); ret = -ENOMEM; goto out; } @@ -1168,37 +1151,38 @@ static int sel_make_avc_files(struct dentry *dir) dentry = d_alloc_name(dir, files[i].name); if (!dentry) { ret = -ENOMEM; - goto err; + goto out; } inode = sel_make_inode(dir->d_sb, S_IFREG|files[i].mode); if (!inode) { ret = -ENOMEM; - goto err; + goto out; } inode->i_fop = files[i].ops; d_add(dentry, inode); } out: return ret; -err: - d_genocide(dir); - goto out; } -static int sel_make_dir(struct super_block *sb, struct dentry *dentry) +static int sel_make_dir(struct inode *dir, struct dentry *dentry) { int ret = 0; struct inode *inode; - inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + inode = sel_make_inode(dir->i_sb, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) { ret = -ENOMEM; goto out; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inode->i_nlink++; d_add(dentry, inode); + /* bump link count on parent directory, too */ + dir->i_nlink++; out: return ret; } @@ -1207,7 +1191,7 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) { int ret; struct dentry *dentry; - struct inode *inode; + struct inode *inode, *root_inode; struct inode_security_struct *isec; static struct tree_descr selinux_files[] = { @@ -1228,30 +1212,33 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) }; ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); if (ret) - return ret; + goto err; + + root_inode = sb->s_root->d_inode; dentry = d_alloc_name(sb->s_root, BOOL_DIR_NAME); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } - inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); - if (!inode) - goto out; - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - d_add(dentry, inode); - bool_dir = dentry; - ret = sel_make_bools(); + ret = sel_make_dir(root_inode, dentry); if (ret) - goto out; + goto err; + + bool_dir = dentry; dentry = d_alloc_name(sb->s_root, NULL_FILE_NAME); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } inode = sel_make_inode(sb, S_IFCHR | S_IRUGO | S_IWUGO); - if (!inode) - goto out; + if (!inode) { + ret = -ENOMEM; + goto err; + } isec = (struct inode_security_struct*)inode->i_security; isec->sid = SECINITSID_DEVNULL; isec->sclass = SECCLASS_CHR_FILE; @@ -1262,22 +1249,23 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) selinux_null = dentry; dentry = d_alloc_name(sb->s_root, "avc"); - if (!dentry) - return -ENOMEM; + if (!dentry) { + ret = -ENOMEM; + goto err; + } - ret = sel_make_dir(sb, dentry); + ret = sel_make_dir(root_inode, dentry); if (ret) - goto out; + goto err; ret = sel_make_avc_files(dentry); if (ret) - goto out; - - return 0; + goto err; out: - dput(dentry); + return ret; +err: printk(KERN_ERR "%s: failed while creating inodes\n", __FUNCTION__); - return -ENOMEM; + goto out; } static struct super_block *sel_get_sb(struct file_system_type *fs_type, diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 8a764928ff4..63e0b7f29cb 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -27,7 +27,8 @@ #include <linux/in.h> #include <linux/sched.h> #include <linux/audit.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> + #include "flask.h" #include "avc.h" #include "avc_ss.h" @@ -48,9 +49,9 @@ static DEFINE_RWLOCK(policy_rwlock); #define POLICY_RDUNLOCK read_unlock(&policy_rwlock) #define POLICY_WRUNLOCK write_unlock_irq(&policy_rwlock) -static DECLARE_MUTEX(load_sem); -#define LOAD_LOCK down(&load_sem) -#define LOAD_UNLOCK up(&load_sem) +static DEFINE_MUTEX(load_mutex); +#define LOAD_LOCK mutex_lock(&load_mutex) +#define LOAD_UNLOCK mutex_unlock(&load_mutex) static struct sidtab sidtab; struct policydb policydb; |
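
A pattern that recurs throughout this part of the patch (process_keys.c, selinuxfs.c and services.c above) is the conversion of semaphores used as plain locks to the mutex API. A minimal sketch of the before/after shape, using hypothetical names (example_mutex, example_op) rather than anything from the patch:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_mutex);	/* was: static DECLARE_MUTEX(example_sem); */

	static int example_op(void)
	{
		mutex_lock(&example_mutex);	/* was: down(&example_sem); */
		/* ... work that must be serialized ... */
		mutex_unlock(&example_mutex);	/* was: up(&example_sem); */
		return 0;
	}

Mutexes are meant for exactly this single-holder use and carry stricter debugging checks than counting semaphores, which is why these one-at-a-time critical sections were converted.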
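
sel_read_bool() above drops its open-coded *ppos bookkeeping and copy_to_user() call in favour of simple_read_from_buffer(). The helper's general use in a read handler looks roughly like this (example_read and the "42" payload are made up for illustration):

	#include <linux/fs.h>
	#include <linux/kernel.h>

	static ssize_t example_read(struct file *file, char __user *buf,
				    size_t count, loff_t *ppos)
	{
		char kbuf[32];
		ssize_t len;

		len = scnprintf(kbuf, sizeof(kbuf), "%d\n", 42);
		/* handles *ppos bounds checking, short reads and copy_to_user() */
		return simple_read_from_buffer(buf, count, ppos, kbuf, len);
	}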
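
The hooks.c change converts inode security blobs from kzalloc()/kfree() to a dedicated slab cache, created once with SLAB_PANIC and used for every allocation thereafter. The general shape, with hypothetical names (example_cache, struct example_blob):

	#include <linux/init.h>
	#include <linux/slab.h>
	#include <linux/string.h>

	struct example_blob {
		int data;
	};

	static kmem_cache_t *example_cache;

	static int __init example_init(void)
	{
		/* SLAB_PANIC: a failure here panics at boot, so no error path */
		example_cache = kmem_cache_create("example_blob_cache",
						  sizeof(struct example_blob),
						  0, SLAB_PANIC, NULL, NULL);
		return 0;
	}

	static struct example_blob *example_alloc(void)
	{
		struct example_blob *p = kmem_cache_alloc(example_cache, SLAB_KERNEL);

		if (p)
			memset(p, 0, sizeof(*p));	/* kmem_cache_alloc() does not zero */
		return p;
	}

	static void example_free(struct example_blob *p)
	{
		kmem_cache_free(example_cache, p);
	}

A per-type cache avoids kmalloc's size-bucket rounding and shows up with its own statistics in /proc/slabinfo, which matters for an object allocated for every inode.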