From 8a35747a5d13b99e076b0222729e0caa48cb69b6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 21 Jul 2010 21:44:31 +0000 Subject: macvtap: Limit packet queue length Mark Wagner reported OOM symptoms when sending UDP traffic over a macvtap link to a kvm receiver. This appears to be caused by the fact that macvtap packet queues are unlimited in length. This means that if the receiver can't keep up with the rate of flow, then we will hit OOM. Of course it gets worse if the OOM killer then decides to kill the receiver. This patch imposes a cap on the packet queue length, in the same way as the tuntap driver, using the device TX queue length. Please note that macvtap currently has no way of giving congestion notification, that means the software device TX queue cannot be used and packets will always be dropped once the macvtap driver queue fills up. This shouldn't be a great problem for the scenario where macvtap is used to feed a kvm receiver, as the traffic is most likely external in origin so congestion notification can't be applied anyway. Of course, if anybody decides to complain about guest-to-guest UDP packet loss down the track, then we may have to revisit this. Incidentally, this patch also fixes a real memory leak when macvtap_get_queue fails. Chris Wright noticed that for this patch to work, we need a non-zero TX queue length. This patch includes his work to change the default macvtap TX queue length to 500. Reported-by: Mark Wagner Signed-off-by: Herbert Xu Acked-by: Chris Wright Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9ea047aca79..1ffaeffeff7 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -67,6 +67,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, } } +extern void macvlan_common_setup(struct net_device *dev); + extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], int (*receive)(struct sk_buff *skb), -- cgit v1.2.3-70-g09d2 From 72ad5d77fb981963edae15eee8196c80238f5ed0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jul 2010 22:59:09 +0200 Subject: ACPI / Sleep: Allow the NVS saving to be skipped during suspend to RAM Commit 2a6b69765ad794389f2fc3e14a0afa1a995221c2 (ACPI: Store NVS state even when entering suspend to RAM) caused the ACPI suspend code save the NVS area during suspend and restore it during resume unconditionally, although it is known that some systems need to use acpi_sleep=s4_nonvs for hibernation to work. To allow the affected systems to avoid saving and restoring the NVS area during suspend to RAM and resume, introduce kernel command line option acpi_sleep=nonvs and make acpi_sleep=s4_nonvs work as its alias temporarily (add acpi_sleep=s4_nonvs to the feature removal file). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=16396 . Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: tomas m Signed-off-by: Len Brown --- Documentation/feature-removal-schedule.txt | 7 ++++++ Documentation/kernel-parameters.txt | 4 ++-- arch/x86/kernel/acpi/sleep.c | 9 ++++++-- drivers/acpi/sleep.c | 35 +++++++++++++++--------------- include/linux/acpi.h | 2 +- 5 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index c268783bc4e..1571c0c83db 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -647,3 +647,10 @@ Who: Stefan Richter ---------------------------- +What: The acpi_sleep=s4_nonvs command line option +When: 2.6.37 +Files: arch/x86/kernel/acpi/sleep.c +Why: superseded by acpi_sleep=nonvs +Who: Rafael J. Wysocki + +---------------------------- diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4ddb58df081..2b2407d9a6d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -254,8 +254,8 @@ and is between 256 and 4096 characters. It is defined in the file control method, with respect to putting devices into low power states, to be enforced (the ACPI 2.0 ordering of _PTS is used by default). - s4_nonvs prevents the kernel from saving/restoring the - ACPI NVS memory during hibernation. + nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during suspend/hibernation and resume. sci_force_enable causes the kernel to set SCI_EN directly on resume from S1/S3 (which is against the ACPI spec, but some broken systems don't work without it). diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b9..fcc3c61fdec 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -157,9 +157,14 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) - acpi_s4_no_nvs(); + if (strncmp(str, "s4_nonvs", 8) == 0) { + pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " + "please use acpi_sleep=nonvs instead"); + acpi_nvs_nosave(); + } #endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 5b7c52e4a00..2862c781b37 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -81,6 +81,20 @@ static int acpi_sleep_prepare(u32 acpi_state) #ifdef CONFIG_ACPI_SLEEP static u32 acpi_target_sleep_state = ACPI_STATE_S0; +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * and to restore them during the subsequent resume. Windows does that also for + * suspend to RAM. However, it is known that this mechanism does not work on + * all machines, so we allow the user to disable it with the help of the + * 'acpi_sleep=nonvs' kernel command line option. + */ +static bool nvs_nosave; + +void __init acpi_nvs_nosave(void) +{ + nvs_nosave = true; +} + /* * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the * user to request that behavior by using the 'acpi_old_suspend_ordering' @@ -197,8 +211,7 @@ static int acpi_suspend_begin(suspend_state_t pm_state) u32 acpi_state = acpi_suspend_states[pm_state]; int error = 0; - error = suspend_nvs_alloc(); - + error = nvs_nosave ? 0 : suspend_nvs_alloc(); if (error) return error; @@ -388,20 +401,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = { #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATION -/* - * The ACPI specification wants us to save NVS memory regions during hibernation - * and to restore them during the subsequent resume. However, it is not certain - * if this mechanism is going to work on all machines, so we allow the user to - * disable this mechanism using the 'acpi_sleep=s4_nonvs' kernel command line - * option. - */ -static bool s4_no_nvs; - -void __init acpi_s4_no_nvs(void) -{ - s4_no_nvs = true; -} - static unsigned long s4_hardware_signature; static struct acpi_table_facs *facs; static bool nosigcheck; @@ -415,7 +414,7 @@ static int acpi_hibernation_begin(void) { int error; - error = s4_no_nvs ? 0 : suspend_nvs_alloc(); + error = nvs_nosave ? 0 : suspend_nvs_alloc(); if (!error) { acpi_target_sleep_state = ACPI_STATE_S4; acpi_sleep_tts_switch(acpi_target_sleep_state); @@ -510,7 +509,7 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); if (!error) { - if (!s4_no_nvs) + if (!nvs_nosave) error = suspend_nvs_alloc(); if (!error) acpi_target_sleep_state = ACPI_STATE_S4; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 224a38c960d..ccf94dc5acd 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -253,7 +253,7 @@ int acpi_resources_are_enforced(void); #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); -void __init acpi_s4_no_nvs(void); +void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ struct acpi_osc_context { -- cgit v1.2.3-70-g09d2 From 7d14831e21060fbfbfe8453460ac19205f4ce1c2 Mon Sep 17 00:00:00 2001 From: Anuj Aggarwal Date: Mon, 12 Jul 2010 17:54:06 +0530 Subject: regulator: tps6507x: allow driver to use DEFDCDC{2,3}_HIGH register Acked-by: Mark Brown In TPS6507x, depending on the status of DEFDCDC{2,3} pin either DEFDCDC{2,3}_LOW or DEFDCDC{2,3}_HIGH register needs to be read or programmed to change the output voltage. The current driver assumes DEFDCDC{2,3} pins are always tied low and thus operates only on DEFDCDC{2,3}_LOW register. This need not always be the case (as is found on OMAP-L138 EVM). Unfortunately, software cannot read the status of DEFDCDC{2,3} pins. So, this information is passed through platform data depending on how the board is wired. Signed-off-by: Anuj Aggarwal Signed-off-by: Sekhar Nori Signed-off-by: Liam Girdwood --- drivers/regulator/tps6507x-regulator.c | 36 +++++++++++++++++++++++++++------- include/linux/regulator/tps6507x.h | 32 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 include/linux/regulator/tps6507x.h (limited to 'include/linux') diff --git a/drivers/regulator/tps6507x-regulator.c b/drivers/regulator/tps6507x-regulator.c index 14b4576281c..8152d65220f 100644 --- a/drivers/regulator/tps6507x-regulator.c +++ b/drivers/regulator/tps6507x-regulator.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -101,9 +102,12 @@ struct tps_info { unsigned max_uV; u8 table_len; const u16 *table; + + /* Does DCDC high or the low register defines output voltage? */ + bool defdcdc_default; }; -static const struct tps_info tps6507x_pmic_regs[] = { +static struct tps_info tps6507x_pmic_regs[] = { { .name = "VDCDC1", .min_uV = 725000, @@ -145,7 +149,7 @@ struct tps6507x_pmic { struct regulator_desc desc[TPS6507X_NUM_REGULATOR]; struct tps6507x_dev *mfd; struct regulator_dev *rdev[TPS6507X_NUM_REGULATOR]; - const struct tps_info *info[TPS6507X_NUM_REGULATOR]; + struct tps_info *info[TPS6507X_NUM_REGULATOR]; struct mutex io_lock; }; static inline int tps6507x_pmic_read(struct tps6507x_pmic *tps, u8 reg) @@ -341,10 +345,16 @@ static int tps6507x_pmic_dcdc_get_voltage(struct regulator_dev *dev) reg = TPS6507X_REG_DEFDCDC1; break; case TPS6507X_DCDC_2: - reg = TPS6507X_REG_DEFDCDC2_LOW; + if (tps->info[dcdc]->defdcdc_default) + reg = TPS6507X_REG_DEFDCDC2_HIGH; + else + reg = TPS6507X_REG_DEFDCDC2_LOW; break; case TPS6507X_DCDC_3: - reg = TPS6507X_REG_DEFDCDC3_LOW; + if (tps->info[dcdc]->defdcdc_default) + reg = TPS6507X_REG_DEFDCDC3_HIGH; + else + reg = TPS6507X_REG_DEFDCDC3_LOW; break; default: return -EINVAL; @@ -370,10 +380,16 @@ static int tps6507x_pmic_dcdc_set_voltage(struct regulator_dev *dev, reg = TPS6507X_REG_DEFDCDC1; break; case TPS6507X_DCDC_2: - reg = TPS6507X_REG_DEFDCDC2_LOW; + if (tps->info[dcdc]->defdcdc_default) + reg = TPS6507X_REG_DEFDCDC2_HIGH; + else + reg = TPS6507X_REG_DEFDCDC2_LOW; break; case TPS6507X_DCDC_3: - reg = TPS6507X_REG_DEFDCDC3_LOW; + if (tps->info[dcdc]->defdcdc_default) + reg = TPS6507X_REG_DEFDCDC3_HIGH; + else + reg = TPS6507X_REG_DEFDCDC3_LOW; break; default: return -EINVAL; @@ -532,7 +548,7 @@ int tps6507x_pmic_probe(struct platform_device *pdev) { struct tps6507x_dev *tps6507x_dev = dev_get_drvdata(pdev->dev.parent); static int desc_id; - const struct tps_info *info = &tps6507x_pmic_regs[0]; + struct tps_info *info = &tps6507x_pmic_regs[0]; struct regulator_init_data *init_data; struct regulator_dev *rdev; struct tps6507x_pmic *tps; @@ -569,6 +585,12 @@ int tps6507x_pmic_probe(struct platform_device *pdev) for (i = 0; i < TPS6507X_NUM_REGULATOR; i++, info++, init_data++) { /* Register the regulators */ tps->info[i] = info; + if (init_data->driver_data) { + struct tps6507x_reg_platform_data *data = + init_data->driver_data; + tps->info[i]->defdcdc_default = data->defdcdc_default; + } + tps->desc[i].name = info->name; tps->desc[i].id = desc_id++; tps->desc[i].n_voltages = num_voltages[i]; diff --git a/include/linux/regulator/tps6507x.h b/include/linux/regulator/tps6507x.h new file mode 100644 index 00000000000..4892f591bab --- /dev/null +++ b/include/linux/regulator/tps6507x.h @@ -0,0 +1,32 @@ +/* + * tps6507x.h -- Voltage regulation for the Texas Instruments TPS6507X + * + * Copyright (C) 2010 Texas Instruments, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef REGULATOR_TPS6507X +#define REGULATOR_TPS6507X + +/** + * tps6507x_reg_platform_data - platform data for tps6507x + * @defdcdc_default: Defines whether DCDC high or the low register controls + * output voltage by default. Valid for DCDC2 and DCDC3 outputs only. + */ +struct tps6507x_reg_platform_data { + bool defdcdc_default; +}; + +#endif -- cgit v1.2.3-70-g09d2 From de09a9771a5346029f4d11e4ac886be7f9bfdd75 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jul 2010 12:45:49 +0100 Subject: CRED: Fix get_task_cred() and task_state() to not resurrect dead credentials It's possible for get_task_cred() as it currently stands to 'corrupt' a set of credentials by incrementing their usage count after their replacement by the task being accessed. What happens is that get_task_cred() can race with commit_creds(): TASK_1 TASK_2 RCU_CLEANER -->get_task_cred(TASK_2) rcu_read_lock() __cred = __task_cred(TASK_2) -->commit_creds() old_cred = TASK_2->real_cred TASK_2->real_cred = ... put_cred(old_cred) call_rcu(old_cred) [__cred->usage == 0] get_cred(__cred) [__cred->usage == 1] rcu_read_unlock() -->put_cred_rcu() [__cred->usage == 1] panic() However, since a tasks credentials are generally not changed very often, we can reasonably make use of a loop involving reading the creds pointer and using atomic_inc_not_zero() to attempt to increment it if it hasn't already hit zero. If successful, we can safely return the credentials in the knowledge that, even if the task we're accessing has released them, they haven't gone to the RCU cleanup code. We then change task_state() in procfs to use get_task_cred() rather than calling get_cred() on the result of __task_cred(), as that suffers from the same problem. Without this change, a BUG_ON in __put_cred() or in put_cred_rcu() can be tripped when it is noticed that the usage count is not zero as it ought to be, for example: kernel BUG at kernel/cred.c:168! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/kernel/mm/ksm/run CPU 0 Pid: 2436, comm: master Not tainted 2.6.33.3-85.fc13.x86_64 #1 0HR330/OptiPlex 745 RIP: 0010:[] [] __put_cred+0xc/0x45 RSP: 0018:ffff88019e7e9eb8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff880161514480 RCX: 00000000ffffffff RDX: 00000000ffffffff RSI: ffff880140c690c0 RDI: ffff880140c690c0 RBP: ffff88019e7e9eb8 R08: 00000000000000d0 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000040 R12: ffff880140c690c0 R13: ffff88019e77aea0 R14: 00007fff336b0a5c R15: 0000000000000001 FS: 00007f12f50d97c0(0000) GS:ffff880007400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8f461bc000 CR3: 00000001b26ce000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process master (pid: 2436, threadinfo ffff88019e7e8000, task ffff88019e77aea0) Stack: ffff88019e7e9ec8 ffffffff810698cd ffff88019e7e9ef8 ffffffff81069b45 <0> ffff880161514180 ffff880161514480 ffff880161514180 0000000000000000 <0> ffff88019e7e9f28 ffffffff8106aace 0000000000000001 0000000000000246 Call Trace: [] put_cred+0x13/0x15 [] commit_creds+0x16b/0x175 [] set_current_groups+0x47/0x4e [] sys_setgroups+0xf6/0x105 [] system_call_fastpath+0x16/0x1b Code: 48 8d 71 ff e8 7e 4e 15 00 85 c0 78 0b 8b 75 ec 48 89 df e8 ef 4a 15 00 48 83 c4 18 5b c9 c3 55 8b 07 8b 07 48 89 e5 85 c0 74 04 <0f> 0b eb fe 65 48 8b 04 25 00 cc 00 00 48 3b b8 58 04 00 00 75 RIP [] __put_cred+0xc/0x45 RSP ---[ end trace df391256a100ebdd ]--- Signed-off-by: David Howells Acked-by: Jiri Olsa Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- include/linux/cred.h | 21 +-------------------- kernel/cred.c | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b58d38bc91..fff6572676a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" diff --git a/include/linux/cred.h b/include/linux/cred.h index 75c0fa88130..ce40cbc791e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -153,6 +153,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); +extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); @@ -281,26 +282,6 @@ static inline void put_cred(const struct cred *_cred) #define __task_cred(task) \ ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()))) -/** - * get_task_cred - Get another task's objective credentials - * @task: The task to query - * - * Get the objective credentials of a task, pinning them so that they can't go - * away. Accessing a task's credentials directly is not permitted. - * - * The caller must make sure task doesn't go away, either by holding a ref on - * task or by holding tasklist_lock to prevent it from being unlinked. - */ -#define get_task_cred(task) \ -({ \ - struct cred *__cred; \ - rcu_read_lock(); \ - __cred = (struct cred *) __task_cred((task)); \ - get_cred(__cred); \ - rcu_read_unlock(); \ - __cred; \ -}) - /** * get_current_cred - Get the current task's subjective credentials * diff --git a/kernel/cred.c b/kernel/cred.c index a2d5504fbcc..60bc8b1e32e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. -- cgit v1.2.3-70-g09d2 From 8f92054e7ca1d3a3ae50fb42d2253ac8730d9b2a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jul 2010 12:45:55 +0100 Subject: CRED: Fix __task_cred()'s lockdep check and banner comment Fix __task_cred()'s lockdep check by removing the following validation condition: lockdep_tasklist_lock_is_held() as commit_creds() does not take the tasklist_lock, and nor do most of the functions that call it, so this check is pointless and it can prevent detection of the RCU lock not being held if the tasklist_lock is held. Instead, add the following validation condition: task->exit_state >= 0 to permit the access if the target task is dead and therefore unable to change its own credentials. Fix __task_cred()'s comment to: (1) discard the bit that says that the caller must prevent the target task from being deleted. That shouldn't need saying. (2) Add a comment indicating the result of __task_cred() should not be passed directly to get_cred(), but rather than get_task_cred() should be used instead. Also put a note into the documentation to enforce this point there too. Signed-off-by: David Howells Acked-by: Jiri Olsa Cc: Paul E. McKenney Signed-off-by: Linus Torvalds --- Documentation/credentials.txt | 3 +++ include/linux/cred.h | 15 ++++++++++----- include/linux/sched.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt index a2db3528700..995baf379c0 100644 --- a/Documentation/credentials.txt +++ b/Documentation/credentials.txt @@ -417,6 +417,9 @@ reference on them using: This does all the RCU magic inside of it. The caller must call put_cred() on the credentials so obtained when they're finished with. + [*] Note: The result of __task_cred() should not be passed directly to + get_cred() as this may race with commit_cred(). + There are a couple of convenience functions to access bits of another task's credentials, hiding the RCU magic from the caller: diff --git a/include/linux/cred.h b/include/linux/cred.h index ce40cbc791e..4d2c39573f3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -274,13 +274,18 @@ static inline void put_cred(const struct cred *_cred) * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU - * readlock. + * readlock or the task must be dead and unable to change its own credentials. * - * The caller must make sure task doesn't go away, either by holding a ref on - * task or by holding tasklist_lock to prevent it from being unlinked. + * The result of this function should not be passed directly to get_cred(); + * rather get_task_cred() should be used instead. */ -#define __task_cred(task) \ - ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()))) +#define __task_cred(task) \ + ({ \ + const struct task_struct *__t = (task); \ + rcu_dereference_check(__t->real_cred, \ + rcu_read_lock_held() || \ + task_is_dead(__t)); \ + }) /** * get_current_cred - Get the current task's subjective credentials diff --git a/include/linux/sched.h b/include/linux/sched.h index 747fcaedddb..0478888c689 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -214,6 +214,7 @@ extern char ___assert_task_state[1 - 2*!!( #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) +#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ -- cgit v1.2.3-70-g09d2 From b608b283a962caaa280756bc8563016a71712acf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jul 2010 15:31:54 -0400 Subject: NFS: kswapd must not block in nfs_release_page See https://bugzilla.kernel.org/show_bug.cgi?id=16056 If other processes are blocked waiting for kswapd to free up some memory so that they can make progress, then we cannot allow kswapd to block on those processes. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/file.c | 13 +++++++++++-- fs/nfs/write.c | 4 ++-- include/linux/nfs_fs.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 36a5e74f51b..f036153d9f5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -493,11 +494,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if ((gfp & GFP_KERNEL) == GFP_KERNEL) - nfs_wb_page(page->mapping->host, page); + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 91679e2631e..0a6c65a1f9d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1379,7 +1379,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; @@ -1443,7 +1443,7 @@ out_mark_dirty: return ret; } #else -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { return 0; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 77c2ae53431..f6e2455f13d 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -493,6 +493,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); #endif -- cgit v1.2.3-70-g09d2 From 77a63f3d1e0a3e7ede8d10f569e8481b13ff47c5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Aug 2010 13:40:40 -0400 Subject: NFS: Fix a typo in include/linux/nfs_fs.h nfs_commit_inode() needs to be defined irrespectively of whether or not we are supporting NFSv3 and NFSv4. Allow the compiler to optimise away code in the NFSv2-only case by converting it into an inlined stub function. Reported-and-tested-by: Ingo Molnar Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/write.c | 5 ----- include/linux/nfs_fs.h | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bb72ad34d51..9f81bdd91c5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1454,11 +1454,6 @@ out_mark_dirty: return ret; } #else -int nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} - static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { return 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f6e2455f13d..bad4d121b16 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -496,6 +496,12 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); +#else +static inline int +nfs_commit_inode(struct inode *inode, int how) +{ + return 0; +} #endif static inline int -- cgit v1.2.3-70-g09d2