From a94e88cdd8057fe8ea84bbb6d9a89a823c7bc49b Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 4 Apr 2014 12:39:11 +0800 Subject: ACPICA: Tables: Avoid SSDT installation with acpi_gbl_disable_ssdt_table_load. It is reported that when acpi_gbl_disable_ssdt_table_load is specified, the user can still see the SSDT installed into /sys/firmware/acpi/tables on Linux boxes. This is because the option only stops table "loading", but doesn't stop table "installing", thus it is still in the acpi_gbl_root_table_list. With previous cleanups, it is possible to prevent SSDT installations to make this less confusing. The global variable is also renamed. Lv Zheng. Signed-off-by: Lv Zheng [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 03e50b4883a..fbb58d790ec 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -237,7 +237,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This feature is enabled by default. This option allows to turn off the feature. - acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT + acpi_no_static_ssdt [HW,ACPI] + Disable installation of static SSDTs at early boot time + By default, SSDTs contained in the RSDT/XSDT will be + installed automatically and they will appear under + /sys/firmware/acpi/tables. + This option turns off this feature. + Note that specifying this option does not affect + dynamic table installation which will install SSDT + tables to /sys/firmware/acpi/tables/dynamic. 
acpica_no_return_repair [HW, ACPI] Disable AML predefined validation mechanism -- cgit v1.2.3-70-g09d2 From 27e289dce29764e488c1e13e9aa6950cad1f4aab Mon Sep 17 00:00:00 2001 From: Stratos Karafotis Date: Fri, 25 Apr 2014 23:15:23 +0300 Subject: cpufreq: Introduce macros for cpufreq_frequency_table iteration Many cpufreq drivers need to iterate over the cpufreq_frequency_table for various tasks. This patch introduces two macros which can be used for iteration over cpufreq_frequency_table keeping a common coding style across drivers: - cpufreq_for_each_entry: iterate over each entry of the table - cpufreq_for_each_valid_entry: iterate over each entry that contains a valid frequency. It should have no functional changes. Signed-off-by: Stratos Karafotis Acked-by: Lad, Prabhakar Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/cpu-drivers.txt | 19 +++++++++++++++++++ drivers/cpufreq/cpufreq.c | 11 +++++++++++ include/linux/cpufreq.h | 21 +++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'Documentation') diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt index 48da5fdcb9f..b045fe54986 100644 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ b/Documentation/cpu-freq/cpu-drivers.txt @@ -228,3 +228,22 @@ is the corresponding frequency table helper for the ->target stage. Just pass the values to this function, and the unsigned int index returns the number of the frequency table entry which contains the frequency the CPU shall be set to. + +The following macros can be used as iterators over cpufreq_frequency_table: + +cpufreq_for_each_entry(pos, table) - iterates over all entries of frequency +table. + +cpufreq_for_each_valid_entry(pos, table) - iterates over all entries, +excluding CPUFREQ_ENTRY_INVALID frequencies. +Use arguments "pos" - a cpufreq_frequency_table * as a loop cursor and +"table" - the cpufreq_frequency_table * you want to iterate over. 
+ +For example: + + struct cpufreq_frequency_table *pos, *driver_freq_table; + + cpufreq_for_each_entry(pos, driver_freq_table) { + /* Do something with pos */ + pos->frequency = ... + } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index abda6609d3e..a517da996aa 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -237,6 +237,17 @@ void cpufreq_cpu_put(struct cpufreq_policy *policy) } EXPORT_SYMBOL_GPL(cpufreq_cpu_put); +bool cpufreq_next_valid(struct cpufreq_frequency_table **pos) +{ + while ((*pos)->frequency != CPUFREQ_TABLE_END) + if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID) + return true; + else + (*pos)++; + return false; +} +EXPORT_SYMBOL_GPL(cpufreq_next_valid); + /********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5ae5100c1f2..77a5fa19150 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -468,6 +468,27 @@ struct cpufreq_frequency_table { * order */ }; +bool cpufreq_next_valid(struct cpufreq_frequency_table **pos); + +/* + * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table + * @pos: the cpufreq_frequency_table * to use as a loop cursor. + * @table: the cpufreq_frequency_table * to iterate over. + */ + +#define cpufreq_for_each_entry(pos, table) \ + for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++) + +/* + * cpufreq_for_each_valid_entry - iterate over a cpufreq_frequency_table + * excluding CPUFREQ_ENTRY_INVALID frequencies. + * @pos: the cpufreq_frequency_table * to use as a loop cursor. + * @table: the cpufreq_frequency_table * to iterate over. 
+ */ + +#define cpufreq_for_each_valid_entry(pos, table) \ + for (pos = table; cpufreq_next_valid(&pos); pos++) + int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); -- cgit v1.2.3-70-g09d2 From dec102aa9ac112d66133314815d20233c96ad749 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 22 Apr 2014 10:42:05 +0530 Subject: cpufreq: Make linux-pm@vger.kernel.org official mailing list There has been confusion all the time about which mailing list to follow for cpufreq activities, linux-pm@vger.kernel.org or cpufreq@vger.kernel.org. Since patches sent to cpufreq@vger.kernel.org don't go to Patchwork which is a maintenance workflow problem, make linux-pm@vger.kernel.org the official mailing list for cpufreq stuff and remove all references of cpufreq@vger.kernel.org from kernel source. Later, we can request that the list be dropped entirely. Signed-off-by: Viresh Kumar [rjw: Changelog] Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 ++-- Documentation/cpu-freq/index.txt | 4 ++-- MAINTAINERS | 2 -- drivers/cpufreq/speedstep-centrino.c | 2 +- tools/power/cpupower/Makefile | 2 +- tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c | 2 +- 6 files changed, 7 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index d5a0d33c571..acb9bfc89b4 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -128,7 +128,7 @@ Description: Discover cpuidle policy and mechanism What: /sys/devices/system/cpu/cpu#/cpufreq/* Date: pre-git history -Contact: cpufreq@vger.kernel.org +Contact: linux-pm@vger.kernel.org Description: Discover and change clock speed of CPUs Clock scaling allows you to change the clock speed of the @@ -146,7 +146,7 @@ Description: Discover and change clock speed of CPUs What: 
/sys/devices/system/cpu/cpu#/cpufreq/freqdomain_cpus Date: June 2013 -Contact: cpufreq@vger.kernel.org +Contact: linux-pm@vger.kernel.org Description: Discover CPUs in the same CPU frequency coordination domain freqdomain_cpus is the list of CPUs (online+offline) that share diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index 3d0b915035b..dc024ab4054 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -35,8 +35,8 @@ Mailing List ------------ There is a CPU frequency changing CVS commit and general list where you can report bugs, problems or submit patches. To post a message, -send an email to cpufreq@vger.kernel.org, to subscribe go to -http://vger.kernel.org/vger-lists.html#cpufreq and follow the +send an email to linux-pm@vger.kernel.org, to subscribe go to +http://vger.kernel.org/vger-lists.html#linux-pm and follow the instructions there. Links diff --git a/MAINTAINERS b/MAINTAINERS index e67ea244204..433163dd935 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2415,7 +2415,6 @@ F: drivers/net/ethernet/ti/cpmac.c CPU FREQUENCY DRIVERS M: Rafael J. 
Wysocki M: Viresh Kumar -L: cpufreq@vger.kernel.org L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git @@ -2426,7 +2425,6 @@ F: include/linux/cpufreq.h CPU FREQUENCY DRIVERS - ARM BIG LITTLE M: Viresh Kumar M: Sudeep Holla -L: cpufreq@vger.kernel.org L: linux-pm@vger.kernel.org W: http://www.arm.com/products/processors/technologies/biglittleprocessing.php S: Maintained diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c index 6723f0390f2..7d4a3157160 100644 --- a/drivers/cpufreq/speedstep-centrino.c +++ b/drivers/cpufreq/speedstep-centrino.c @@ -28,7 +28,7 @@ #include #define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" +#define MAINTAINER "linux-pm@vger.kernel.org" #define INTEL_MSR_RANGE (0xffff) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index cbfec92af32..3651db7eda2 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -62,7 +62,7 @@ LIB_MAJ= 0.0.0 LIB_MIN= 0 PACKAGE = cpupower -PACKAGE_BUGREPORT = cpufreq@vger.kernel.org +PACKAGE_BUGREPORT = linux-pm@vger.kernel.org LANGUAGES = de fr it cs pt diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c index 0f10b81e332..5224ee5b392 100644 --- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c +++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c @@ -18,7 +18,7 @@ * 5.) if the third value, "diff_pmtmr", changes between 2. and 4., the * TSC-based delay routine on the Linux kernel does not correctly * handle the cpufreq transition. 
Please report this to - * cpufreq@vger.kernel.org + * linux-pm@vger.kernel.org */ #include -- cgit v1.2.3-70-g09d2 From a0dd7b79657bd6644b914d16ce7f23468c44a7b4 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 5 May 2014 08:33:50 -0500 Subject: PM / OPP: Move cpufreq specific OPP functions out of generic OPP library CPUFreq specific helper functions for OPP (Operating Performance Points) now use generic OPP functions that allow them to be moved back into the CPUFreq framework. This allows for independent modifications or future enhancements as needed isolated to just CPUFreq framework alone. Here, we just move relevant code and documentation to make this part of CPUFreq infrastructure. Cc: Kevin Hilman Signed-off-by: Nishanth Menon Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/core.txt | 29 +++++++++++ Documentation/power/opp.txt | 40 ++------------- drivers/base/power/opp.c | 92 --------------------------------- drivers/cpufreq/Makefile | 2 + drivers/cpufreq/cpufreq_opp.c | 110 ++++++++++++++++++++++++++++++++++++++++ include/linux/cpufreq.h | 21 ++++++++ include/linux/pm_opp.h | 20 -------- 7 files changed, 167 insertions(+), 147 deletions(-) create mode 100644 drivers/cpufreq/cpufreq_opp.c (limited to 'Documentation') diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt index 0060d76b445..70933eadc30 100644 --- a/Documentation/cpu-freq/core.txt +++ b/Documentation/cpu-freq/core.txt @@ -20,6 +20,7 @@ Contents: --------- 1. CPUFreq core and interfaces 2. CPUFreq notifiers +3. CPUFreq Table Generation with Operating Performance Point (OPP) 1. General Information ======================= @@ -92,3 +93,31 @@ values: cpu - number of the affected CPU old - old frequency new - new frequency + +3. 
CPUFreq Table Generation with Operating Performance Point (OPP) +================================================================== +For details about OPP, see Documentation/power/opp.txt + +dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with + cpufreq_frequency_table_cpuinfo which is provided with the list of + frequencies that are available for operation. This function provides + a ready to use conversion routine to translate the OPP layer's internal + information about the available frequencies into a format readily + providable to cpufreq. + + WARNING: Do not use this function in interrupt context. + + Example: + soc_pm_init() + { + /* Do things */ + r = dev_pm_opp_init_cpufreq_table(dev, &freq_table); + if (!r) + cpufreq_frequency_table_cpuinfo(policy, freq_table); + /* Do other things */ + } + + NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in + addition to CONFIG_PM_OPP. + +dev_pm_opp_free_cpufreq_table - Free up the table allocated by dev_pm_opp_init_cpufreq_table diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt index b8a907dc016..a9adad828cd 100644 --- a/Documentation/power/opp.txt +++ b/Documentation/power/opp.txt @@ -10,8 +10,7 @@ Contents 3. OPP Search Functions 4. OPP Availability Control Functions 5. OPP Data Retrieval Functions -6. Cpufreq Table Generation -7. Data Structures +6. Data Structures 1. Introduction =============== @@ -72,7 +71,6 @@ operations until that OPP could be re-enabled if possible. OPP library facilitates this concept in it's implementation. The following operational functions operate only on available opps: opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count -and dev_pm_opp_init_cpufreq_table dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then be used for dev_pm_opp_enable/disable functions to make an opp available as required. @@ -96,10 +94,9 @@ using RCU read locks. 
The opp_find_freq_{exact,ceil,floor}, opp_get_{voltage, freq, opp_count} fall into this category. opp_{add,enable,disable} are updaters which use mutex and implement it's own -RCU locking mechanisms. dev_pm_opp_init_cpufreq_table acts as an updater and uses -mutex to implment RCU updater strategy. These functions should *NOT* be called -under RCU locks and other contexts that prevent blocking functions in RCU or -mutex operations from working. +RCU locking mechanisms. These functions should *NOT* be called under RCU locks +and other contexts that prevent blocking functions in RCU or mutex operations +from working. 2. Initial OPP List Registration ================================ @@ -311,34 +308,7 @@ dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device /* Do other things */ } -6. Cpufreq Table Generation -=========================== -dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with - cpufreq_frequency_table_cpuinfo which is provided with the list of - frequencies that are available for operation. This function provides - a ready to use conversion routine to translate the OPP layer's internal - information about the available frequencies into a format readily - providable to cpufreq. - - WARNING: Do not use this function in interrupt context. - - Example: - soc_pm_init() - { - /* Do things */ - r = dev_pm_opp_init_cpufreq_table(dev, &freq_table); - if (!r) - cpufreq_frequency_table_cpuinfo(policy, freq_table); - /* Do other things */ - } - - NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in - addition to CONFIG_PM as power management feature is required to - dynamically scale voltage and frequency in a system. - -dev_pm_opp_free_cpufreq_table - Free up the table allocated by dev_pm_opp_init_cpufreq_table - -7. Data Structures +6. Data Structures ================== Typically an SoC contains multiple voltage domains which are variable. Each domain is represented by a device pointer. 
The relationship to OPP can be diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 38b43bb2087..d9e376a6d19 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -596,97 +595,6 @@ int dev_pm_opp_disable(struct device *dev, unsigned long freq) } EXPORT_SYMBOL_GPL(dev_pm_opp_disable); -#ifdef CONFIG_CPU_FREQ -/** - * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device - * @dev: device for which we do this operation - * @table: Cpufreq table returned back to caller - * - * Generate a cpufreq table for a provided device- this assumes that the - * opp list is already initialized and ready for usage. - * - * This function allocates required memory for the cpufreq table. It is - * expected that the caller does the required maintenance such as freeing - * the table as required. - * - * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM - * if no memory available for the operation (table is not populated), returns 0 - * if successful and table is populated. - * - * WARNING: It is important for the callers to ensure refreshing their copy of - * the table if any of the mentioned functions have been invoked in the interim. - * - * Locking: The internal device_opp and opp structures are RCU protected. - * Since we just use the regular accessor functions to access the internal data - * structures, we use RCU read lock inside this function. As a result, users of - * this function DONOT need to use explicit locks for invoking. - */ -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table) -{ - struct dev_pm_opp *opp; - struct cpufreq_frequency_table *freq_table = NULL; - int i, max_opps, ret = 0; - unsigned long rate; - - rcu_read_lock(); - - max_opps = dev_pm_opp_get_opp_count(dev); - if (max_opps <= 0) { - ret = max_opps ? 
max_opps : -ENODATA; - goto out; - } - - freq_table = kzalloc(sizeof(*freq_table) * (max_opps + 1), GFP_KERNEL); - if (!freq_table) { - ret = -ENOMEM; - goto out; - } - - for (i = 0, rate = 0; i < max_opps; i++, rate++) { - /* find next rate */ - opp = dev_pm_opp_find_freq_ceil(dev, &rate); - if (IS_ERR(opp)) { - ret = PTR_ERR(opp); - goto out; - } - freq_table[i].driver_data = i; - freq_table[i].frequency = rate / 1000; - } - - freq_table[i].driver_data = i; - freq_table[i].frequency = CPUFREQ_TABLE_END; - - *table = &freq_table[0]; - -out: - rcu_read_unlock(); - if (ret) - kfree(freq_table); - - return ret; -} -EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table); - -/** - * dev_pm_opp_free_cpufreq_table() - free the cpufreq table - * @dev: device for which we do this operation - * @table: table to free - * - * Free up the table allocated by dev_pm_opp_init_cpufreq_table - */ -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table) -{ - if (!table) - return; - - kfree(*table); - *table = NULL; -} -EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table); -#endif /* CONFIG_CPU_FREQ */ - /** * dev_pm_opp_get_notifier() - find notifier_head of the device with opp * @dev: device pointer used to lookup device OPPs. diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 0dbb963c1ae..738c8b7b17d 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -1,5 +1,7 @@ # CPUfreq core obj-$(CONFIG_CPU_FREQ) += cpufreq.o freq_table.o +obj-$(CONFIG_PM_OPP) += cpufreq_opp.o + # CPUfreq stats obj-$(CONFIG_CPU_FREQ_STAT) += cpufreq_stats.o diff --git a/drivers/cpufreq/cpufreq_opp.c b/drivers/cpufreq/cpufreq_opp.c new file mode 100644 index 00000000000..c0c6f4a4ecc --- /dev/null +++ b/drivers/cpufreq/cpufreq_opp.c @@ -0,0 +1,110 @@ +/* + * Generic OPP helper interface for CPUFreq drivers + * + * Copyright (C) 2009-2014 Texas Instruments Incorporated. 
+ * Nishanth Menon + * Romit Dasgupta + * Kevin Hilman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device + * @dev: device for which we do this operation + * @table: Cpufreq table returned back to caller + * + * Generate a cpufreq table for a provided device- this assumes that the + * opp list is already initialized and ready for usage. + * + * This function allocates required memory for the cpufreq table. It is + * expected that the caller does the required maintenance such as freeing + * the table as required. + * + * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM + * if no memory available for the operation (table is not populated), returns 0 + * if successful and table is populated. + * + * WARNING: It is important for the callers to ensure refreshing their copy of + * the table if any of the mentioned functions have been invoked in the interim. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Since we just use the regular accessor functions to access the internal data + * structures, we use RCU read lock inside this function. As a result, users of + * this function DONOT need to use explicit locks for invoking. + */ +int dev_pm_opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table) +{ + struct dev_pm_opp *opp; + struct cpufreq_frequency_table *freq_table = NULL; + int i, max_opps, ret = 0; + unsigned long rate; + + rcu_read_lock(); + + max_opps = dev_pm_opp_get_opp_count(dev); + if (max_opps <= 0) { + ret = max_opps ? 
max_opps : -ENODATA; + goto out; + } + + freq_table = kzalloc(sizeof(*freq_table) * (max_opps + 1), GFP_KERNEL); + if (!freq_table) { + ret = -ENOMEM; + goto out; + } + + for (i = 0, rate = 0; i < max_opps; i++, rate++) { + /* find next rate */ + opp = dev_pm_opp_find_freq_ceil(dev, &rate); + if (IS_ERR(opp)) { + ret = PTR_ERR(opp); + goto out; + } + freq_table[i].driver_data = i; + freq_table[i].frequency = rate / 1000; + } + + freq_table[i].driver_data = i; + freq_table[i].frequency = CPUFREQ_TABLE_END; + + *table = &freq_table[0]; + +out: + rcu_read_unlock(); + if (ret) + kfree(freq_table); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table); + +/** + * dev_pm_opp_free_cpufreq_table() - free the cpufreq table + * @dev: device for which we do this operation + * @table: table to free + * + * Free up the table allocated by dev_pm_opp_init_cpufreq_table + */ +void dev_pm_opp_free_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table) +{ + if (!table) + return; + + kfree(*table); + *table = NULL; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f3822f836e1..9d803b529ac 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -469,6 +469,27 @@ struct cpufreq_frequency_table { * order */ }; +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int dev_pm_opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table); +void dev_pm_opp_free_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table **table); +#else +static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table + **table) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, + struct cpufreq_frequency_table + **table) +{ +} +#endif + + bool cpufreq_next_valid(struct cpufreq_frequency_table **pos); /* diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 
5151b005958..0330217abfa 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -15,7 +15,6 @@ #define __LINUX_OPP_H__ #include -#include #include struct dev_pm_opp; @@ -117,23 +116,4 @@ static inline int of_init_opp_table(struct device *dev) } #endif -#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -#else -static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table) -{ - return -EINVAL; -} - -static inline -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table) -{ -} -#endif /* CONFIG_CPU_FREQ */ - #endif /* __LINUX_OPP_H__ */ -- cgit v1.2.3-70-g09d2 From 886129a8eebebec260165741fe31421482371006 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 6 May 2014 14:46:23 +0200 Subject: ACPI / video: change acpi-video brightness_switch_enabled default to 0 acpi-video is unique in that it not only generates brightness up/down keypresses, but also (sometimes) actively changes the brightness itself. This presents an inconsistent kernel interface to userspace, basically there are 2 different scenarios, depending on the laptop model: 1) On some laptops a brightness up/down keypress means: show a brightness osd with the current brightness, iow it is a brightness has changed notification. 2) Where as on (a lot of) other laptops it means a brightness up/down key was pressed, deal with it. Most of the desktop environments interpret any press as in scenario 2, and change the brightness up / down as a response to the key events, causing it to be changed twice, once by acpi-video and once by the DE. With the new default for video.use_native_backlight we will be moving even more laptops over to behaving as in scenario 2. 
Making the remaining laptops even more of a weird exception. Also note that it is hard to detect scenario 1 properly in userspace, and AFAIK none of the DE-s deals with it. Therefore this commit changes the default of brightness_switch_enabled to 0, making its behavior consistent with all the other backlight drivers. Signed-off-by: Hans de Goede Reviewed-by: Aaron Lu Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 2 +- drivers/acpi/video.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43842177b77..cc2c243ac78 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3461,7 +3461,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. the allocated input device; If set to 0, video driver will only send out the event without touching backlight brightness level. - default: 1 + default: 0 virtio_mmio.device= [VMMIO] Memory mapped virtio (platform) device. diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 88393899a0b..fced27d8e42 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -68,7 +68,7 @@ MODULE_AUTHOR("Bruno Ducrot"); MODULE_DESCRIPTION("ACPI Video Driver"); MODULE_LICENSE("GPL"); -static bool brightness_switch_enabled = 1; +static bool brightness_switch_enabled; module_param(brightness_switch_enabled, bool, 0644); /* -- cgit v1.2.3-70-g09d2 From 8a54cd5bd6ebf009b96ec79510b593f7ba5c0ff3 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Tue, 6 May 2014 13:01:56 +0200 Subject: PM / hibernate: Documentation: Fix script for unswapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The system can also have character devices mmapped (e.g. DRI devices by X) or deleted files. Running cat on character devices is a really bad idea (the system can hang), so run cat only on regular files. 
Also mmaped files can have spaces in filenames. Signed-off-by: Pali Rohár [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- Documentation/power/swsusp.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index 079160e22bc..f732a8321e8 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -220,7 +220,10 @@ Q: After resuming, system is paging heavily, leading to very bad interactivity. A: Try running -cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null +cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file +do + test -f "$file" && cat "$file" > /dev/null +done after resume. swapoff -a; swapon -a may also be useful. -- cgit v1.2.3-70-g09d2 From 4ec6a9cc23cbadb4721eb3fe778389c4bbede893 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 16 May 2014 13:05:59 +0200 Subject: PM / runtime: Update documentation to reflect the current code flow The runtime PM documentation in runtime_pm.txt has not been updated after some changes to the system suspend and resume core code, so update it to reflect the current code flow. Signed-off-by: Rafael J. 
Wysocki Acked-by: Alan Stern --- Documentation/power/runtime_pm.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 5f96daf8566..054893e093c 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -648,15 +648,17 @@ The PM core does its best to reduce the probability of race conditions between the runtime PM and system suspend/resume (and hibernation) callbacks by carrying out the following operations: - * During system suspend it calls pm_runtime_get_noresume() and - pm_runtime_barrier() for every device right before executing the - subsystem-level .suspend() callback for it. In addition to that it calls - __pm_runtime_disable() with 'false' as the second argument for every device - right before executing the subsystem-level .suspend_late() callback for it. - - * During system resume it calls pm_runtime_enable() and pm_runtime_put() - for every device right after executing the subsystem-level .resume_early() - callback and right after executing the subsystem-level .resume() callback + * During system suspend pm_runtime_get_noresume() is called for every device + right before executing the subsystem-level .prepare() callback for it and + pm_runtime_barrier() is called for every device right before executing the + subsystem-level .suspend() callback for it. In addition to that the PM core + calls __pm_runtime_disable() with 'false' as the second argument for every + device right before executing the subsystem-level .suspend_late() callback + for it. + + * During system resume pm_runtime_enable() and pm_runtime_put() are called for + every device right after executing the subsystem-level .resume_early() + callback and right after executing the subsystem-level .complete() callback for it, respectively. 7. 
Generic subsystem callbacks -- cgit v1.2.3-70-g09d2 From f71495f3f0c5f0801823d1235b271a4a415d3df8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 16 May 2014 02:47:37 +0200 Subject: PM / sleep: Update device PM documentation to cover direct_complete Update the device PM documentation in devices.txt and runtime_pm.txt to reflect the changes in the system suspend and resume handling related to the introduction of the new power.direct_complete flag. Signed-off-by: Rafael J. Wysocki Acked-by: Alan Stern --- Documentation/power/devices.txt | 34 ++++++++++++++++++++++++++++++---- Documentation/power/runtime_pm.txt | 17 +++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 47d46dff70f..d172bce0fd4 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -2,6 +2,7 @@ Device Power Management Copyright (c) 2010-2011 Rafael J. Wysocki , Novell Inc. Copyright (c) 2010 Alan Stern +Copyright (c) 2014 Intel Corp., Rafael J. Wysocki Most of the code in Linux is device drivers, so most of the Linux power @@ -326,6 +327,20 @@ the phases are: driver in some way for the upcoming system power transition, but it should not put the device into a low-power state. + For devices supporting runtime power management, the return value of the + prepare callback can be used to indicate to the PM core that it may + safely leave the device in runtime suspend (if runtime-suspended + already), provided that all of the device's descendants are also left in + runtime suspend. 
Namely, if the prepare callback returns a positive + number and that happens for all of the descendants of the device too, + and all of them (including the device itself) are runtime-suspended, the + PM core will skip the suspend, suspend_late and suspend_noirq suspend + phases as well as the resume_noirq, resume_early and resume phases of + the following system resume for all of these devices. In that case, + the complete callback will be called directly after the prepare callback + and is entirely responsible for bringing the device back to the + functional state as appropriate. + 2. The suspend methods should quiesce the device to stop it from performing I/O. They also may save the device registers and put it into the appropriate low-power state, depending on the bus type the device is on, @@ -400,12 +415,23 @@ When resuming from freeze, standby or memory sleep, the phases are: the resume callbacks occur; it's not necessary to wait until the complete phase. + Moreover, if the preceding prepare callback returned a positive number, + the device may have been left in runtime suspend throughout the whole + system suspend and resume (the suspend, suspend_late, suspend_noirq + phases of system suspend and the resume_noirq, resume_early, resume + phases of system resume may have been skipped for it). In that case, + the complete callback is entirely responsible for bringing the device + back to the functional state after system suspend if necessary. [For + example, it may need to queue up a runtime resume request for the device + for this purpose.] To check if that is the case, the complete callback + can consult the device's power.direct_complete flag. Namely, if that + flag is set when the complete callback is being run, it has been called + directly after the preceding prepare and special action may be required + to make the device work correctly afterward. 
+ At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are -gated on. Even if the device was in a low-power state before the system sleep -because of runtime power management, afterwards it should be back in its -full-power state. There are multiple reasons why it's best to do this; they are -discussed in more detail in Documentation/power/runtime_pm.txt. +gated on. However, the details here may again be platform-specific. For example, some systems support multiple "run" states, and the mode in effect at diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 5f96daf8566..e1bee8a4aaa 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -2,6 +2,7 @@ Runtime Power Management Framework for I/O Devices (C) 2009-2011 Rafael J. Wysocki , Novell Inc. (C) 2010 Alan Stern +(C) 2014 Intel Corp., Rafael J. Wysocki 1. Introduction @@ -444,6 +445,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: bool pm_runtime_status_suspended(struct device *dev); - return true if the device's runtime PM status is 'suspended' + bool pm_runtime_suspended_if_enabled(struct device *dev); + - return true if the device's runtime PM status is 'suspended' and its + 'power.disable_depth' field is equal to 1 + void pm_runtime_allow(struct device *dev); - set the power.runtime_auto flag for the device and decrease its usage counter (used by the /sys/devices/.../power/control interface to @@ -644,6 +649,18 @@ place (in particular, if the system is not waking up from hibernation), it may be more efficient to leave the devices that had been suspended before the system suspend began in the suspended state. +To this end, the PM core provides a mechanism allowing some coordination between +different levels of device hierarchy. 
Namely, if a system suspend .prepare() +callback returns a positive number for a device, that indicates to the PM core +that the device appears to be runtime-suspended and its state is fine, so it +may be left in runtime suspend provided that all of its descendants are also +left in runtime suspend. If that happens, the PM core will not execute any +system suspend and resume callbacks for all of those devices, except for the +complete callback, which is then entirely responsible for handling the device +as appropriate. This only applies to system suspend transitions that are not +related to hibernation (see Documentation/power/devices.txt for more +information). + The PM core does its best to reduce the probability of race conditions between the runtime PM and system suspend/resume (and hibernation) callbacks by carrying out the following operations: -- cgit v1.2.3-70-g09d2 From 0399d4db3edf5c58b6ec7f672f089f5085e49ed5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:59 +0200 Subject: PM / sleep: Introduce command line argument for sleep state enumeration On some systems the platform supports neither PM_SUSPEND_MEM nor PM_SUSPEND_STANDBY, so PM_SUSPEND_FREEZE is the only available system sleep state. However, some user space frameworks only use the "mem" and (sometimes) "standby" sleep state labels, so the users of those systems need to modify user space in order to be able to use system suspend at all and that is not always possible. For this reason, add a new kernel command line argument, relative_sleep_states, allowing the users of those systems to change the way in which the kernel assigns labels to system sleep states.
Namely, for relative_sleep_states=1, the "mem", "standby" and "freeze" labels will enumerate the available system sleep states from the deepest to the shallowest, respectively, so that "mem" is always present in /sys/power/state and the other state strings may or may not be present depending on what is supported by the platform. Update system sleep states documentation to reflect this change. Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 29 ++++++++---- Documentation/kernel-parameters.txt | 7 +++ Documentation/power/states.txt | 87 ++++++++++++++++++++++------------- kernel/power/main.c | 12 ++--- kernel/power/suspend.c | 32 ++++++++++++- 5 files changed, 119 insertions(+), 48 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 64c9276e942..f4551816329 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,19 +7,30 @@ Description: subsystem. What: /sys/power/state -Date: August 2006 +Date: May 2014 Contact: Rafael J. Wysocki Description: - The /sys/power/state file controls the system power state. - Reading from this file returns what states are supported, - which is hard-coded to 'freeze' (Low-Power Idle), 'standby' - (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk' - (Suspend-to-Disk). + The /sys/power/state file controls system sleep states. + Reading from this file returns the available sleep state + labels, which may be "mem", "standby", "freeze" and "disk" + (hibernation). The meanings of the first three labels depend on + the relative_sleep_states command line argument as follows: + 1) relative_sleep_states = 1 + "mem", "standby", "freeze" represent non-hibernation sleep + states from the deepest ("mem", always present) to the + shallowest ("freeze"). "standby" and "freeze" may or may + not be present depending on the capabilities of the + platform.
"freeze" can only be present if "standby" is + present. + 2) relative_sleep_states = 0 (default) + "mem" - "suspend-to-RAM", present if supported. + "standby" - "power-on suspend", present if supported. + "freeze" - "suspend-to-idle", always present. Writing to this file one of these strings causes the system to - transition into that state. Please see the file - Documentation/power/states.txt for a description of each of - these states. + transition into the corresponding state, if available. See + Documentation/power/states.txt for a description of what + "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43842177b77..e19a88b63ee 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2889,6 +2889,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroups/cpusets.txt. + relative_sleep_states= + [SUSPEND] Use sleep state labeling where the deepest + state available other than hibernation is always "mem". + Format: { "0" | "1" } + 0 -- Traditional sleep state labels. + 1 -- Relative sleep state labels. + reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 442d43df9b2..50f3ef9177c 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -1,62 +1,87 @@ +System Power Management Sleep States -System Power Management States +(C) 2014 Intel Corp., Rafael J. Wysocki +The kernel supports up to four system sleep states generically, although three +of them depend on the platform support code to implement the low-level details +for each state. 
-The kernel supports four power management states generically, though -one is generic and the other three are dependent on platform support -code to implement the low-level details for each state. -This file describes each state, what they are -commonly called, what ACPI state they map to, and what string to write -to /sys/power/state to enter that state +The states are represented by strings that can be read or written to the +/sys/power/state file. Those strings may be "mem", "standby", "freeze" and +"disk", where the last one always represents hibernation (Suspend-To-Disk) and +the meaning of the remaining ones depends on the relative_sleep_states command +line argument. -state: Freeze / Low-Power Idle +For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the +available non-hibernation sleep states from the deepest to the shallowest, +respectively. In that case, "mem" is always present in /sys/power/state, +because there is at least one non-hibernation sleep state in every system. If +the given system supports two non-hibernation sleep states, "standby" is present +in /sys/power/state in addition to "mem". If the system supports three +non-hibernation sleep states, "freeze" will be present in /sys/power/state in +addition to "mem" and "standby". + +For relative_sleep_states=0, which is the default, the following descriptions +apply. + +state: Suspend-To-Idle ACPI state: S0 -String: "freeze" +Label: "freeze" -This state is a generic, pure software, light-weight, low-power state. -It allows more energy to be saved relative to idle by freezing user +This state is a generic, pure software, light-weight, system sleep state. +It allows more energy to be saved relative to runtime idle by freezing user space and putting all I/O devices into low-power states (possibly lower-power than available at run time), such that the processors can spend more time in their idle states. 
-This state can be used for platforms without Standby/Suspend-to-RAM + +This state can be used for platforms without Power-On Suspend/Suspend-to-RAM support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. +to provide reduced resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -String: "standby" +Label: "standby" -This state offers minimal, though real, power savings, while providing -a very low-latency transition back to a working system. No operating -state is lost (the CPU retains power), so the system easily starts up +This state, if supported, offers moderate, though real, power savings, while +providing a relatively low-latency transition back to a working system. No +operating state is lost (the CPU retains power), so the system easily starts up again where it left off. -We try to put devices in a low-power state equivalent to D1, which -also offers low power savings, but low resume latency. Not all devices -support D1, and those that don't are left on. +In addition to freezing user space and putting all I/O devices into low-power +states, which is done for Suspend-To-Idle too, nonboot CPUs are taken offline +and all low-level system functions are suspended during transitions into this +state. For this reason, it should allow more energy to be saved relative to +Suspend-To-Idle, but the resume latency will generally be greater than for that +state. State: Suspend-to-RAM ACPI State: S3 -String: "mem" +Label: "mem" -This state offers significant power savings as everything in the -system is put into a low-power state, except for memory, which is -placed in self-refresh mode to retain its contents. +This state, if supported, offers significant power savings as everything in the +system is put into a low-power state, except for memory, which should be placed +into the self-refresh mode to retain its contents. 
All of the steps carried out +when entering Power-On Suspend are also carried out during transitions to STR. +Additional operations may take place depending on the platform capabilities. In +particular, on ACPI systems the kernel passes control to the BIOS (platform +firmware) as the last step during STR transitions and that usually results in +powering down some more low-level components that aren't directly controlled by +the kernel. -System and device state is saved and kept in memory. All devices are -suspended and put into D3. In many cases, all peripheral buses lose -power when entering STR, so devices must be able to handle the -transition back to the On state. +System and device state is saved and kept in memory. All devices are suspended +and put into low-power states. In many cases, all peripheral buses lose power +when entering STR, so devices must be able to handle the transition back to the +"on" state. -For at least ACPI, STR requires some minimal boot-strapping code to -resume the system from STR. This may be true on other platforms. +For at least ACPI, STR requires some minimal boot-strapping code to resume the +system from it. This may be the case on other platforms too. State: Suspend-to-disk ACPI State: S4 -String: "disk" +Label: "disk" This state offers the greatest power savings, and can be used even in the absence of low-level platform support for power management. This diff --git a/kernel/power/main.c b/kernel/power/main.c index 9f51f0ab3d8..573410d6647 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -279,14 +279,14 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; /** - * state - control system power state. + * state - control system sleep states. * - * show() returns what states are supported, which is hard-coded to - * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), - * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 
+ * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a + * description of what they mean. * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 00aca60904b..338a6f14797 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -78,6 +78,26 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -85,12 +105,20 @@ static bool valid_state(suspend_state_t state) void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_STANDBY; i <= PM_SUSPEND_MEM; i++) - pm_states[i].state = valid_state(i) ? 
i : 0; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; unlock_system_sleep(); } -- cgit v1.2.3-70-g09d2 From 4fc0a7e889e5540305926e41931cf3bc0a60abb2 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Sat, 31 May 2014 08:15:02 +0800 Subject: ACPI: Fix x86 regression related to early mapping size limitation The following warning message is triggered: WARNING: CPU: 0 PID: 0 at mm/early_ioremap.c:136 __early_ioremap+0x11f/0x1f2() Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 3.15.0-rc1-00017-g86dfc6f3-dirty #298 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.99.99.x036.091920111209 09/19/2011 0000000000000009 ffffffff81b75c40 ffffffff817c627b 0000000000000000 ffffffff81b75c78 ffffffff81067b5d 000000000000007b 8000000000000563 00000000b96b20dc 0000000000000001 ffffffffff300e0c ffffffff81b75c88 Call Trace: [] dump_stack+0x45/0x56 [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_null+0x1a/0x20 [] __early_ioremap+0x11f/0x1f2 [] early_ioremap+0x13/0x15 [] __acpi_map_table+0x13/0x18 [] acpi_os_map_memory+0x26/0x14e [] acpi_tb_acquire_table+0x42/0x70 [] acpi_tb_validate_table+0x27/0x37 [] acpi_tb_verify_table+0x22/0xd8 [] acpi_tb_install_non_fixed_table+0x60/0x1c9 [] acpi_tb_parse_root_table+0x218/0x26a [] ? early_idt_handlers+0x120/0x120 [] acpi_initialize_tables+0x57/0x59 [] acpi_table_init+0x1b/0x99 [] acpi_boot_table_init+0x1e/0x85 [] setup_arch+0x99d/0xcc6 [] ? early_idt_handlers+0x120/0x120 [] start_kernel+0x8b/0x415 [] ? 
early_idt_handlers+0x120/0x120 [] x86_64_start_reservations+0x2a/0x2c [] x86_64_start_kernel+0x13e/0x14d ---[ end trace 11ae599a1898f4e7 ]--- when installing the following table during early stage: ACPI: SSDT 0x00000000B9638018 07A0C4 (v02 INTEL S2600CP 00004000 INTL 20100331) The regression is caused by the size limitation of the x86 early IO mapping. The root cause is: 1. ACPICA doesn't split IO memory mapping and table mapping; 2. Linux x86 OSL implements acpi_os_map_memory() using a size limited fix-map mechanism during early boot stage, which is more suitable for only IO mappings. This patch fixes this issue by utilizing acpi_gbl_verify_table_checksum to disable the table mapping during early stage and enabling it again for the late stage. In this way, the normal code path is not affected. Then after the code related to the root cause is cleaned up, the early checksum verification can be easily re-enabled. A new boot parameter - acpi_force_table_verification is introduced for the platforms that require the checksum verification to stop loading bad tables. This fix also covers the checksum verification for the table overrides. Now large tables can also be overridden using the initrd override mechanism. Signed-off-by: Lv Zheng Reported-and-tested-by: Yuanhan Liu Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 5 +++++ drivers/acpi/bus.c | 3 +++ drivers/acpi/tables.c | 23 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 60a27894865..4b7cc148143 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -214,6 +214,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. unusable. The "log_buf_len" parameter may be useful if you need to capture more output. + acpi_force_table_verification [HW,ACPI] + Enable table checksum verification during early stage. 
+ By default, this is disabled due to x86 early mapping + size limitation. + acpi_irq_balance [HW,ACPI] ACPI will balance active IRQs default in APIC mode diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index cf925c4f36b..cf0b5ecf55b 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -466,6 +466,9 @@ void __init acpi_early_init(void) printk(KERN_INFO PREFIX "Core revision %08x\n", ACPI_CA_VERSION); + /* It's safe to verify table checksums during late stage */ + acpi_gbl_verify_table_checksum = TRUE; + /* enable workarounds, unless strict ACPI spec. compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 21782290df4..05550ba44d3 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -44,6 +44,12 @@ static struct acpi_table_desc initial_tables[ACPI_MAX_TABLES] __initdata; static int acpi_apic_instance __initdata; +/* + * Disable table checksum verification for the early stage due to the size + * limitation of the current x86 early mapping implementation. 
+ */ +static bool acpi_verify_table_checksum __initdata = false; + void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { if (!header) @@ -333,6 +339,14 @@ int __init acpi_table_init(void) { acpi_status status; + if (acpi_verify_table_checksum) { + pr_info("Early table checksum verification enabled\n"); + acpi_gbl_verify_table_checksum = TRUE; + } else { + pr_info("Early table checksum verification disabled\n"); + acpi_gbl_verify_table_checksum = FALSE; + } + status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0); if (ACPI_FAILURE(status)) return -EINVAL; @@ -354,3 +368,12 @@ static int __init acpi_parse_apic_instance(char *str) } early_param("acpi_apic_instance", acpi_parse_apic_instance); + +static int __init acpi_force_table_verification_setup(char *s) +{ + acpi_verify_table_checksum = true; + + return 0; +} + +early_param("acpi_force_table_verification", acpi_force_table_verification_setup); -- cgit v1.2.3-70-g09d2