From 72d0d248ca8232dbd30d35b42d0d86e39b3e322b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 16 Jul 2012 15:23:48 -0400 Subject: fuse: add FUSE_AUTO_INVAL_DATA init flag FUSE_AUTO_INVAL_DATA is provided to enable updated/auto cache invalidation logic. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 9303348965f..e4a9d2af9aa 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -57,6 +57,9 @@ * * 7.19 * - add FUSE_FALLOCATE + * + * 7.20 + * - add FUSE_AUTO_INVAL_DATA */ #ifndef _LINUX_FUSE_H @@ -88,7 +91,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 19 +#define FUSE_KERNEL_MINOR_VERSION 20 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -167,6 +170,7 @@ struct fuse_file_lock { * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." * FUSE_DONT_MASK: don't apply umask to file mode on create operations * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -176,6 +180,7 @@ struct fuse_file_lock { #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) #define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_AUTO_INVAL_DATA (1 << 12) /** * CUSE INIT request/reply flags -- cgit v1.2.3-70-g09d2 From 69fe05c90ed58aac956dccb9e6d3a325fb3b8767 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2012 16:09:40 +0200 Subject: fuse: add missing INIT flags Add missing flags that userspace derived from the protocol version number. This makes the protocol more flexible. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 ++- include/linux/fuse.h | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index dd4401650b4..ce0a2838ccd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -882,7 +882,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | - FUSE_FLOCK_LOCKS | FUSE_AUTO_INVAL_DATA; + FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index e4a9d2af9aa..6455c5b64c2 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -169,7 +169,11 @@ struct fuse_file_lock { * FUSE_POSIX_LOCKS: remote locking for POSIX file locks * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_SPLICE_WRITE: kernel supports splice write on the device + * FUSE_SPLICE_MOVE: kernel supports splice move on the device + * FUSE_SPLICE_READ: kernel supports splice read on the device * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages */ #define FUSE_ASYNC_READ (1 << 0) @@ -179,7 +183,11 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) +#define FUSE_SPLICE_WRITE (1 << 7) +#define FUSE_SPLICE_MOVE (1 << 8) +#define FUSE_SPLICE_READ (1 << 9) #define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_HAS_IOCTL_DIR (1 << 11) #define FUSE_AUTO_INVAL_DATA (1 << 12) /** -- cgit v1.2.3-70-g09d2 From f3840dc0fb57aef120c5ee8241cdc9aaf3cec8d4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2012 16:09:40 +0200 Subject: fuse: add missing INIT flag descriptions Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 6455c5b64c2..d8c713e148e 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -166,8 +166,12 @@ struct fuse_file_lock { /** * INIT request/reply flags * + * FUSE_ASYNC_READ: asynchronous read requests * FUSE_POSIX_LOCKS: remote locking for POSIX file locks + * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) + * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." + * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB * FUSE_DONT_MASK: don't apply umask to file mode on create operations * FUSE_SPLICE_WRITE: kernel supports splice write on the device * FUSE_SPLICE_MOVE: kernel supports splice move on the device -- cgit v1.2.3-70-g09d2 From dc9b229a58dc0dfed34272ff26c6d5fd17c674e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2012 19:29:45 +0200 Subject: genirq: Allow irq chips to mark themself oneshot safe Some interrupt chips like MSI are oneshot safe by implementation. For those interrupts we can avoid the mask/unmask sequence for threaded interrupt handlers. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1207132056540.32033@ionos Cc: Linus Torvalds Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Jan Kiszka --- include/linux/irq.h | 1 + kernel/irq/manage.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 553fb66da13..216b0ba109d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -349,6 +349,7 @@ enum { IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), + IRQCHIP_ONESHOT_SAFE = (1 << 5), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba3..2e326d1ebec 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -959,6 +959,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_thread; } + /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + /* * The following block of code has to be executed atomically */ @@ -1033,7 +1045,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it -- cgit v1.2.3-70-g09d2 From 6956dc568f34107f1d02b24f87efe7250803fc87 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 20 Jul 2012 14:19:50 +0800 Subject: sched/numa: Add SD_PERFER_SIBLING to CPU domain Commit 8e7fbcbc22c ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") removed SD_PERFER_SIBLING from the CPU domain. On NUMA machines this causes that load_balance() doesn't perfer LCPU in same physical CPU package. It causes some actual performance regressions on our NUMA machines from Core2 to NHM and SNB. Adding this domain flag again recovers the performance drop. This change doesn't have any bad impact on any of my benchmarks: specjbb, kbuild, fio, hackbench .. etc, on all my machines. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342765190-21540-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- include/linux/topology.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index e91cd43394d..fec12d66721 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -164,6 +164,7 @@ int arch_update_cpu_topology(void); | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ + | 1*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ -- cgit v1.2.3-70-g09d2 From a7e4786b937a3ae918a7520cfdba557a80915fa7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 21 Jul 2012 00:54:59 +0530 Subject: sched: Fix comment about PREEMPT_ACTIVE bit location PREEMPT_ACTIVE flag is bit 27, not 28. Fix the comment. Signed-off-by: Srivatsa S. Bhat Cc: paulmck@linux.vnet.ibm.com Cc: josh@joshtriplett.org Cc: rostedt@goodmis.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120720192459.6149.14821.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index bb7f3097185..305f23cd7cf 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -22,7 +22,7 @@ * * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) * - bit 26 is the NMI_MASK - * - bit 28 is the PREEMPT_ACTIVE flag + * - bit 27 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 -- cgit v1.2.3-70-g09d2 From 3f6f49c7854c9294119437a82c5b35be78f9cea6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 26 Jul 2012 20:08:54 -0400 Subject: ACPI: delete _GTS/_BFS support _GTS and _BFS were added to the suspend/resume flow in the ACPI 2.0 specification. Linux dutifully implemented _GTS and _BFS. We discovered that it was rarely seen in systems in the field. Further, some of those systems had AML so bogus that it could never work -- proof that no other operating system supports _GTS and _BFS. So we made _GTS and _BFS optional via modparam, and disabled them by default. But we've had to complicate some code to keep this support in the kernel, as these methods are defined to be evaluated very close to sleep entry and exit. Indeed, no other AML is ever evaluated with interrupts off. We have submitted a proposal for _GTS and _BFS to be officially removed from the ACPI specification on the next revision. Here we remove it from Linux. Signed-off-by: Len Brown Acked-by: Ingo Molnar Acked-by: Konrad Rzeszutek Wilk --- drivers/acpi/acpica/achware.h | 12 +++---- drivers/acpi/acpica/hwesleep.c | 19 ++--------- drivers/acpi/acpica/hwsleep.c | 20 ++--------- drivers/acpi/acpica/hwxfsleep.c | 22 ++++++------- drivers/acpi/sleep.c | 73 +++++------------------------------------ include/acpi/acpixf.h | 4 +-- include/acpi/actypes.h | 2 +- 7 files changed, 34 insertions(+), 118 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 5ccb99ae3a6..5de4ec72766 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -83,22 +83,22 @@ acpi_status acpi_hw_clear_acpi_status(void); /* * hwsleep - sleep/wake support (Legacy sleep registers) */ -acpi_status acpi_hw_legacy_sleep(u8 sleep_state, u8 flags); +acpi_status acpi_hw_legacy_sleep(u8 sleep_state); -acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state, u8 flags); +acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state); -acpi_status acpi_hw_legacy_wake(u8 sleep_state, u8 flags); +acpi_status acpi_hw_legacy_wake(u8 sleep_state); /* * hwesleep - sleep/wake support (Extended FADT-V5 sleep registers) */ void acpi_hw_execute_sleep_method(char *method_name, u32 integer_argument); -acpi_status acpi_hw_extended_sleep(u8 sleep_state, u8 flags); +acpi_status acpi_hw_extended_sleep(u8 sleep_state); -acpi_status acpi_hw_extended_wake_prep(u8 sleep_state, u8 flags); +acpi_status acpi_hw_extended_wake_prep(u8 sleep_state); -acpi_status acpi_hw_extended_wake(u8 sleep_state, u8 flags); +acpi_status acpi_hw_extended_wake(u8 sleep_state); /* * hwvalid - Port I/O with validation diff --git a/drivers/acpi/acpica/hwesleep.c b/drivers/acpi/acpica/hwesleep.c index 48518dac534..94996f9ae3a 100644 --- a/drivers/acpi/acpica/hwesleep.c +++ b/drivers/acpi/acpica/hwesleep.c @@ -90,7 +90,6 @@ void acpi_hw_execute_sleep_method(char *method_pathname, u32 integer_argument) * FUNCTION: acpi_hw_extended_sleep * * PARAMETERS: sleep_state - Which sleep state to enter - * flags - ACPI_EXECUTE_GTS to run optional method * * RETURN: Status * @@ -100,7 +99,7 @@ void acpi_hw_execute_sleep_method(char *method_pathname, u32 integer_argument) * ******************************************************************************/ -acpi_status acpi_hw_extended_sleep(u8 sleep_state, u8 flags) +acpi_status acpi_hw_extended_sleep(u8 sleep_state) { acpi_status status; u8 sleep_type_value; @@ -125,12 +124,6 @@ acpi_status acpi_hw_extended_sleep(u8 sleep_state, u8 flags) acpi_gbl_system_awake_and_running = FALSE; - /* Optionally execute _GTS (Going To Sleep) */ - - if (flags & ACPI_EXECUTE_GTS) { - acpi_hw_execute_sleep_method(METHOD_PATHNAME__GTS, sleep_state); - } - /* Flush caches, as per ACPI specification */ ACPI_FLUSH_CPU_CACHE(); @@ -172,7 +165,6 @@ acpi_status acpi_hw_extended_sleep(u8 sleep_state, u8 flags) * FUNCTION: acpi_hw_extended_wake_prep * * PARAMETERS: sleep_state - Which sleep state we just exited - * flags - ACPI_EXECUTE_BFS to run optional method * * RETURN: Status * @@ -181,7 +173,7 @@ acpi_status acpi_hw_extended_sleep(u8 sleep_state, u8 flags) * ******************************************************************************/ -acpi_status acpi_hw_extended_wake_prep(u8 sleep_state, u8 flags) +acpi_status acpi_hw_extended_wake_prep(u8 sleep_state) { acpi_status status; u8 sleep_type_value; @@ -200,11 +192,6 @@ acpi_status acpi_hw_extended_wake_prep(u8 sleep_state, u8 flags) &acpi_gbl_FADT.sleep_control); } - /* Optionally execute _BFS (Back From Sleep) */ - - if (flags & ACPI_EXECUTE_BFS) { - acpi_hw_execute_sleep_method(METHOD_PATHNAME__BFS, sleep_state); - } return_ACPI_STATUS(AE_OK); } @@ -222,7 +209,7 @@ acpi_status acpi_hw_extended_wake_prep(u8 sleep_state, u8 flags) * ******************************************************************************/ -acpi_status acpi_hw_extended_wake(u8 sleep_state, u8 flags) +acpi_status acpi_hw_extended_wake(u8 sleep_state) { ACPI_FUNCTION_TRACE(hw_extended_wake); diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 9960fe9ef53..3fddde056a5 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -56,7 +56,6 @@ ACPI_MODULE_NAME("hwsleep") * FUNCTION: acpi_hw_legacy_sleep * * PARAMETERS: sleep_state - Which sleep state to enter - * flags - ACPI_EXECUTE_GTS to run optional method * * RETURN: Status * @@ -64,7 +63,7 @@ ACPI_MODULE_NAME("hwsleep") * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED * ******************************************************************************/ -acpi_status acpi_hw_legacy_sleep(u8 sleep_state, u8 flags) +acpi_status acpi_hw_legacy_sleep(u8 sleep_state) { struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; @@ -110,12 +109,6 @@ acpi_status acpi_hw_legacy_sleep(u8 sleep_state, u8 flags) return_ACPI_STATUS(status); } - /* Optionally execute _GTS (Going To Sleep) */ - - if (flags & ACPI_EXECUTE_GTS) { - acpi_hw_execute_sleep_method(METHOD_PATHNAME__GTS, sleep_state); - } - /* Get current value of PM1A control */ status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, @@ -214,7 +207,6 @@ acpi_status acpi_hw_legacy_sleep(u8 sleep_state, u8 flags) * FUNCTION: acpi_hw_legacy_wake_prep * * PARAMETERS: sleep_state - Which sleep state we just exited - * flags - ACPI_EXECUTE_BFS to run optional method * * RETURN: Status * @@ -224,7 +216,7 @@ acpi_status acpi_hw_legacy_sleep(u8 sleep_state, u8 flags) * ******************************************************************************/ -acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state, u8 flags) +acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state) { acpi_status status; struct acpi_bit_register_info *sleep_type_reg_info; @@ -275,11 +267,6 @@ acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state, u8 flags) } } - /* Optionally execute _BFS (Back From Sleep) */ - - if (flags & ACPI_EXECUTE_BFS) { - acpi_hw_execute_sleep_method(METHOD_PATHNAME__BFS, sleep_state); - } return_ACPI_STATUS(status); } @@ -288,7 +275,6 @@ acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state, u8 flags) * FUNCTION: acpi_hw_legacy_wake * * PARAMETERS: sleep_state - Which sleep state we just exited - * flags - Reserved, set to zero * * RETURN: Status * @@ -297,7 +283,7 @@ acpi_status acpi_hw_legacy_wake_prep(u8 sleep_state, u8 flags) * ******************************************************************************/ -acpi_status acpi_hw_legacy_wake(u8 sleep_state, u8 flags) +acpi_status acpi_hw_legacy_wake(u8 sleep_state) { acpi_status status; diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c index f8684bfe790..1f165a750ae 100644 --- a/drivers/acpi/acpica/hwxfsleep.c +++ b/drivers/acpi/acpica/hwxfsleep.c @@ -50,7 +50,7 @@ ACPI_MODULE_NAME("hwxfsleep") /* Local prototypes */ static acpi_status -acpi_hw_sleep_dispatch(u8 sleep_state, u8 flags, u32 function_id); +acpi_hw_sleep_dispatch(u8 sleep_state, u32 function_id); /* * Dispatch table used to efficiently branch to the various sleep @@ -235,7 +235,7 @@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_s4bios) * ******************************************************************************/ static acpi_status -acpi_hw_sleep_dispatch(u8 sleep_state, u8 flags, u32 function_id) +acpi_hw_sleep_dispatch(u8 sleep_state, u32 function_id) { acpi_status status; struct acpi_sleep_functions *sleep_functions = @@ -248,11 +248,11 @@ acpi_hw_sleep_dispatch(u8 sleep_state, u8 flags, u32 function_id) * use the extended sleep registers */ if (acpi_gbl_reduced_hardware || acpi_gbl_FADT.sleep_control.address) { - status = sleep_functions->extended_function(sleep_state, flags); + status = sleep_functions->extended_function(sleep_state); } else { /* Legacy sleep */ - status = sleep_functions->legacy_function(sleep_state, flags); + status = sleep_functions->legacy_function(sleep_state); } return (status); @@ -262,7 +262,7 @@ acpi_hw_sleep_dispatch(u8 sleep_state, u8 flags, u32 function_id) * For the case where reduced-hardware-only code is being generated, * we know that only the extended sleep registers are available */ - status = sleep_functions->extended_function(sleep_state, flags); + status = sleep_functions->extended_function(sleep_state); return (status); #endif /* !ACPI_REDUCED_HARDWARE */ @@ -349,7 +349,6 @@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_prep) * FUNCTION: acpi_enter_sleep_state * * PARAMETERS: sleep_state - Which sleep state to enter - * flags - ACPI_EXECUTE_GTS to run optional method * * RETURN: Status * @@ -357,7 +356,7 @@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_prep) * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED * ******************************************************************************/ -acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state, u8 flags) +acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) { acpi_status status; @@ -371,7 +370,7 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state, u8 flags) } status = - acpi_hw_sleep_dispatch(sleep_state, flags, ACPI_SLEEP_FUNCTION_ID); + acpi_hw_sleep_dispatch(sleep_state, ACPI_SLEEP_FUNCTION_ID); return_ACPI_STATUS(status); } @@ -391,14 +390,14 @@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state) * Called with interrupts DISABLED. * ******************************************************************************/ -acpi_status acpi_leave_sleep_state_prep(u8 sleep_state, u8 flags) +acpi_status acpi_leave_sleep_state_prep(u8 sleep_state) { acpi_status status; ACPI_FUNCTION_TRACE(acpi_leave_sleep_state_prep); status = - acpi_hw_sleep_dispatch(sleep_state, flags, + acpi_hw_sleep_dispatch(sleep_state, ACPI_WAKE_PREP_FUNCTION_ID); return_ACPI_STATUS(status); } @@ -423,8 +422,7 @@ acpi_status acpi_leave_sleep_state(u8 sleep_state) ACPI_FUNCTION_TRACE(acpi_leave_sleep_state); - - status = acpi_hw_sleep_dispatch(sleep_state, 0, ACPI_WAKE_FUNCTION_ID); + status = acpi_hw_sleep_dispatch(sleep_state, ACPI_WAKE_FUNCTION_ID); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 23a53c013f1..bd35e3c5e53 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -28,34 +28,6 @@ #include "internal.h" #include "sleep.h" -u8 wake_sleep_flags = ACPI_NO_OPTIONAL_METHODS; -static unsigned int gts, bfs; -static int set_param_wake_flag(const char *val, struct kernel_param *kp) -{ - int ret = param_set_int(val, kp); - - if (ret) - return ret; - - if (kp->arg == (const char *)>s) { - if (gts) - wake_sleep_flags |= ACPI_EXECUTE_GTS; - else - wake_sleep_flags &= ~ACPI_EXECUTE_GTS; - } - if (kp->arg == (const char *)&bfs) { - if (bfs) - wake_sleep_flags |= ACPI_EXECUTE_BFS; - else - wake_sleep_flags &= ~ACPI_EXECUTE_BFS; - } - return ret; -} -module_param_call(gts, set_param_wake_flag, param_get_int, >s, 0644); -module_param_call(bfs, set_param_wake_flag, param_get_int, &bfs, 0644); -MODULE_PARM_DESC(gts, "Enable evaluation of _GTS on suspend."); -MODULE_PARM_DESC(bfs, "Enable evaluation of _BFS on resume".); - static u8 sleep_states[ACPI_S_STATE_COUNT]; static bool pwr_btn_event_pending; @@ -305,7 +277,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) switch (acpi_state) { case ACPI_STATE_S1: barrier(); - status = acpi_enter_sleep_state(acpi_state, wake_sleep_flags); + status = acpi_enter_sleep_state(acpi_state); break; case ACPI_STATE_S3: @@ -319,8 +291,8 @@ static int acpi_suspend_enter(suspend_state_t pm_state) /* This violates the spec but is required for bug compatibility. */ acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); - /* Reprogram control registers and execute _BFS */ - acpi_leave_sleep_state_prep(acpi_state, wake_sleep_flags); + /* Reprogram control registers */ + acpi_leave_sleep_state_prep(acpi_state); /* ACPI 3.0 specs (P62) says that it's the responsibility * of the OSPM to clear the status bit [ implying that the @@ -603,9 +575,9 @@ static int acpi_hibernation_enter(void) ACPI_FLUSH_CPU_CACHE(); /* This shouldn't return. If it returns, we have a problem */ - status = acpi_enter_sleep_state(ACPI_STATE_S4, wake_sleep_flags); - /* Reprogram control registers and execute _BFS */ - acpi_leave_sleep_state_prep(ACPI_STATE_S4, wake_sleep_flags); + status = acpi_enter_sleep_state(ACPI_STATE_S4); + /* Reprogram control registers */ + acpi_leave_sleep_state_prep(ACPI_STATE_S4); return ACPI_SUCCESS(status) ? 0 : -EFAULT; } @@ -617,8 +589,8 @@ static void acpi_hibernation_leave(void) * enable it here. */ acpi_enable(); - /* Reprogram control registers and execute _BFS */ - acpi_leave_sleep_state_prep(ACPI_STATE_S4, wake_sleep_flags); + /* Reprogram control registers */ + acpi_leave_sleep_state_prep(ACPI_STATE_S4); /* Check the hardware signature */ if (facs && s4_hardware_signature != facs->hardware_signature) { printk(KERN_EMERG "ACPI: Hardware changed while hibernated, " @@ -876,33 +848,7 @@ static void acpi_power_off(void) /* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */ printk(KERN_DEBUG "%s called\n", __func__); local_irq_disable(); - acpi_enter_sleep_state(ACPI_STATE_S5, wake_sleep_flags); -} - -/* - * ACPI 2.0 created the optional _GTS and _BFS, - * but industry adoption has been neither rapid nor broad. - * - * Linux gets into trouble when it executes poorly validated - * paths through the BIOS, so disable _GTS and _BFS by default, - * but do speak up and offer the option to enable them. - */ -static void __init acpi_gts_bfs_check(void) -{ - acpi_handle dummy; - - if (ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT, METHOD_PATHNAME__GTS, &dummy))) - { - printk(KERN_NOTICE PREFIX "BIOS offers _GTS\n"); - printk(KERN_NOTICE PREFIX "If \"acpi.gts=1\" improves suspend, " - "please notify linux-acpi@vger.kernel.org\n"); - } - if (ACPI_SUCCESS(acpi_get_handle(ACPI_ROOT_OBJECT, METHOD_PATHNAME__BFS, &dummy))) - { - printk(KERN_NOTICE PREFIX "BIOS offers _BFS\n"); - printk(KERN_NOTICE PREFIX "If \"acpi.bfs=1\" improves resume, " - "please notify linux-acpi@vger.kernel.org\n"); - } + acpi_enter_sleep_state(ACPI_STATE_S5); } int __init acpi_sleep_init(void) @@ -963,6 +909,5 @@ int __init acpi_sleep_init(void) * object can also be evaluated when the system enters S5. */ register_reboot_notifier(&tts_notifier); - acpi_gts_bfs_check(); return 0; } diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 2c744c7a5b3..26a92fc28a5 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -491,11 +491,11 @@ acpi_get_sleep_type_data(u8 sleep_state, u8 * slp_typ_a, u8 * slp_typ_b); acpi_status acpi_enter_sleep_state_prep(u8 sleep_state); -acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state, u8 flags); +acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state); ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void)) -acpi_status acpi_leave_sleep_state_prep(u8 sleep_state, u8 flags); +acpi_status acpi_leave_sleep_state_prep(u8 sleep_state); acpi_status acpi_leave_sleep_state(u8 sleep_state); diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 3af87de6a68..3d00bd5bd7e 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -803,7 +803,7 @@ typedef u8 acpi_adr_space_type; /* Sleep function dispatch */ -typedef acpi_status(*ACPI_SLEEP_FUNCTION) (u8 sleep_state, u8 flags); +typedef acpi_status(*ACPI_SLEEP_FUNCTION) (u8 sleep_state); struct acpi_sleep_functions { ACPI_SLEEP_FUNCTION legacy_function; -- cgit v1.2.3-70-g09d2 From 0f26d0e0a715556270d85b7946b99546a2f92888 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 30 Jul 2012 22:44:41 -0500 Subject: kdb: Remove unused KDB_FLAG_ONLY_DO_DUMP This code cleanup was missed in the original kdb merge, and this code is simply not used at all. The code that was previously used to set the KDB_FLAG_ONLY_DO_DUMP was removed prior to the initial kdb merge. Signed-off-by: Jason Wessel --- include/linux/kdb.h | 2 -- kernel/debug/kdb/kdb_main.c | 15 +-------------- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 064725854db..42d9e863a31 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -75,8 +75,6 @@ extern const char *kdb_diemsg; #define KDB_FLAG_CATASTROPHIC (1 << 1) /* A catastrophic event has occurred */ #define KDB_FLAG_CMD_INTERRUPT (1 << 2) /* Previous command was interrupted */ #define KDB_FLAG_NOIPI (1 << 3) /* Do not send IPIs */ -#define KDB_FLAG_ONLY_DO_DUMP (1 << 4) /* Only do a dump, used when - * kdb is off */ #define KDB_FLAG_NO_CONSOLE (1 << 5) /* No console is available, * kdb is disabled */ #define KDB_FLAG_NO_VT_CONSOLE (1 << 6) /* No VT console is available, do diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1f91413edb8..31df1706b9a 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), -- cgit v1.2.3-70-g09d2 From e6dab5ffab59e910ec0e3355f4a6f29f7a7be474 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Wed, 11 Jul 2012 18:14:58 +0400 Subject: perf/trace: Add ability to set a target task for events A few events are interesting not only for a current task. For example, sched_stat_* events are interesting for a task which wakes up. For this reason, it will be good if such events will be delivered to a target task too. Now a target task can be set by using __perf_task(). The original idea and a draft patch belongs to Peter Zijlstra. I need these events for profiling sleep times. sched_switch is used for getting callchains and sched_stat_* is used for getting time periods. These events are combined in user space, then it can be analyzed by perf tools. Inspired-by: Peter Zijlstra Cc: Steven Rostedt Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Arun Sharma Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342016098-213063-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 5 +++-- include/linux/perf_event.h | 3 ++- include/trace/events/sched.h | 4 ++++ include/trace/ftrace.h | 6 +++++- kernel/events/callchain.c | 9 ++++++++- kernel/events/core.c | 30 ++++++++++++++++++++++++++++-- kernel/events/internal.h | 3 ++- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_kprobe.c | 6 ++++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 11 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index af961d6f7ab..642928cf57b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -306,9 +306,10 @@ extern void *perf_trace_buf_prepare(int size, unsigned short type, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, - u64 count, struct pt_regs *regs, void *head) + u64 count, struct pt_regs *regs, void *head, + struct task_struct *task) { - perf_tp_event(addr, count, raw_data, size, regs, head, rctx); + perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 76c5c8b724a..7602ccb3f40 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1272,7 +1272,8 @@ static inline bool perf_paranoid_kernel(void) extern void perf_event_init(void); extern void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct pt_regs *regs, - struct hlist_head *head, int rctx); + struct hlist_head *head, int rctx, + struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ea7a2035456..5a8671e8a67 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -73,6 +73,9 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, __entry->prio = p->prio; __entry->success = success; __entry->target_cpu = task_cpu(p); + ) + TP_perf_assign( + __perf_task(p); ), TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d", @@ -325,6 +328,7 @@ DECLARE_EVENT_CLASS(sched_stat_template, ) TP_perf_assign( __perf_count(delay); + __perf_task(tsk); ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6bc2faaf26..a763888a36f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -712,6 +712,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __perf_count #define __perf_count(c) __count = (c) +#undef __perf_task +#define __perf_task(t) __task = (t) + #undef TP_perf_assign #define TP_perf_assign(args...) args @@ -725,6 +728,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_raw_##call *entry; \ struct pt_regs __regs; \ u64 __addr = 0, __count = 1; \ + struct task_struct *__task = NULL; \ struct hlist_head *head; \ int __entry_size; \ int __data_size; \ @@ -752,7 +756,7 @@ perf_trace_##call(void *__data, proto) \ \ head = this_cpu_ptr(event_call->perf_events); \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head); \ + __count, &__regs, head, __task); \ } /* diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a040f39..98d4597f43d 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f1cf0edeb39..b7935fcec7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90af..a096c19f2c2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f..8a6d2ee2086 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699f..1a2117043bb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc7336909..60e4d787567 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549..03003cd7dd9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); -- cgit v1.2.3-70-g09d2 From a7ea3bbf5d58f4df2265d312f91d5769eabc8144 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 27 Jul 2012 14:48:09 -0400 Subject: time/jiffies: Allow CLOCK_TICK_RATE to be undefined CLOCK_TICK_RATE is a legacy constant that defines the timer device's granularity. On hardware with particularly coarse granularity, this constant is used to reduce accumulated time error when using jiffies as a clocksource, by calculating the hardware's actual tick length rather then just assuming it is 1sec/HZ. However, for the most part this is unnecessary, as most modern systems don't use jiffies for their clocksource, and their tick device is sufficiently fine grained to avoid major error. Thus, this patch allows an architecture to not define CLOCK_TICK_RATE, in which case ACTHZ defaults to (HZ << 8). Signed-off-by: Catalin Marinas Acked-by: Arnd Bergmann Cc: Richard Cochran Cc: Prarit Bhargava Cc: Andrew Morton [ Commit log & intention tweaks ] Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1343414893-45779-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 265e2c3cbd1..7d24466fb80 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -39,9 +39,6 @@ # error Invalid value of HZ. #endif -/* LATCH is used in the interval timer and ftape setup. */ -#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ - /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN @@ -54,8 +51,15 @@ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) +#ifdef CLOCK_TICK_RATE +/* LATCH is used in the interval timer and ftape setup. */ +# define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ + /* HZ is the requested value. ACTHZ is actual HZ ("<< 8" is for accuracy) */ -#define ACTHZ (SH_DIV (CLOCK_TICK_RATE, LATCH, 8)) +# define ACTHZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) +#else +# define ACTHZ (HZ << 8) +#endif /* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ #define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) -- cgit v1.2.3-70-g09d2 From 02ab20ae38337b99b5c29c81090f594b8fd61283 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:10 -0400 Subject: time/jiffies: Rename ACTHZ to SHIFTED_HZ Ingo noted that ACTHZ is a confusing name, and requested it be renamed, so this patch renames ACTHZ to SHIFTED_HZ to better describe it. Signed-off-by: John Stultz Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 21 +++++++++++++-------- include/linux/timex.h | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/ntp.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 7d24466fb80..82680541576 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -55,21 +55,26 @@ /* LATCH is used in the interval timer and ftape setup. */ # define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ -/* HZ is the requested value. ACTHZ is actual HZ ("<< 8" is for accuracy) */ -# define ACTHZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) +/* + * HZ is the requested value. However the CLOCK_TICK_RATE may not allow + * for exactly HZ. So SHIFTED_HZ is high res HZ ("<< 8" is for accuracy) + */ +# define SHIFTED_HZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) #else -# define ACTHZ (HZ << 8) +# define SHIFTED_HZ (HZ << 8) #endif -/* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ -#define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) +/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +#define TICK_NSEC (SH_DIV(1000000UL * 1000, SHIFTED_HZ, 8)) /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* TICK_USEC_TO_NSEC is the time between ticks in nsec assuming real ACTHZ and */ -/* a value TUSEC for TICK_USEC (can be set bij adjtimex) */ -#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV (TUSEC * USER_HZ * 1000, ACTHZ, 8)) +/* + * TICK_USEC_TO_NSEC is the time between ticks in nsec assuming SHIFTED_HZ and + * a value TUSEC for TICK_USEC (can be set bij adjtimex) + */ +#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV(TUSEC * USER_HZ * 1000, SHIFTED_HZ, 8)) /* some arch's have a small-data section that can be accessed register-relative * but that can only take up to, say, 4-byte variables. jiffies being part of diff --git a/include/linux/timex.h b/include/linux/timex.h index 99bc88b1fc0..7c5ceb20e03 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -232,7 +232,7 @@ struct timex { * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ -extern unsigned long tick_nsec; /* ACTHZ period (nsec) */ +extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ extern void ntp_init(void); extern void ntp_clear(void); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e040..46da0537c10 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c97..24174b4d669 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; -- cgit v1.2.3-70-g09d2 From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 15:57:00 +0000 Subject: net: Allow driver to limit number of GSO segments per skb A peer (or local user) may cause TCP to use a nominal MSS of as little as 88 (actual MSS of 76 with timestamps). Given that we have a sufficiently prodigious local sender and the peer ACKs quickly enough, it is nevertheless possible to grow the window for such a connection to the point that we will try to send just under 64K at once. This results in a single skb that expands to 861 segments. In some drivers with TSO support, such an skb will require hundreds of DMA descriptors; a substantial fraction of a TX ring or even more than a full ring. The TX queue selected for the skb may stall and trigger the TX watchdog repeatedly (since the problem skb will be retried after the TX reset). This particularly affects sfc, for which the issue is designated as CVE-2012-3412. Therefore: 1. Add the field net_device::gso_max_segs holding the device-specific limit. 2. In netif_skb_features(), if the number of segments is too high then mask out GSO features to force fall back to software GSO. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb06e58bed0..a9db4f33407 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1300,6 +1300,8 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define GSO_MAX_SEGS 65535 + u16 gso_max_segs; #ifdef CONFIG_DCB /* Data Center Bridging netlink ops */ diff --git a/net/core/dev.c b/net/core/dev.c index 0cb3fe8d8e7..f91abf80016 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) __be16 protocol = skb->protocol; netdev_features_t features = skb->dev->features; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; + if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; @@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.2.3-70-g09d2 From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 16:11:42 +0000 Subject: tcp: Apply device TSO segment limit earlier Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to limit the size of TSO skbs. This avoids the need to fall back to software GSO for local TCP senders. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 1 + net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_cong.c | 3 ++- net/ipv4/tcp_output.c | 21 ++++++++++++--------- 5 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b3730239bf1..72132aef53f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -218,6 +218,7 @@ struct cg_proto; * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build + * @sk_gso_max_segs: Maximum number of GSO segments * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct @@ -338,6 +339,7 @@ struct sock { netdev_features_t sk_route_nocaps; int sk_gso_type; unsigned int sk_gso_max_size; + u16 sk_gso_max_segs; int sk_rcvlowat; unsigned long sk_lingertime; struct sk_buff_head sk_error_queue; diff --git a/net/core/sock.c b/net/core/sock.c index 6b654b3ddfd..8f67ced8d6a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1458,6 +1458,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; + sk->sk_gso_max_segs = dst->dev->gso_max_segs; } } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e7e6eeae49c..2109ff4a1da 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, old_size_goal + mss_now > xmit_size_goal)) { xmit_size_goal = old_size_goal; } else { - tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + tp->xmit_size_goal_segs = + min_t(u16, xmit_size_goal / mss_now, + sk->sk_gso_max_segs); xmit_size_goal = tp->xmit_size_goal_segs * mss_now; } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4d4db16e336..1432cdb0644 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) left = tp->snd_cwnd - in_flight; if (sk_can_gso(sk) && left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && - left * tp->mss_cache < sk->sk_gso_max_size) + left * tp->mss_cache < sk->sk_gso_max_size && + left < sk->sk_gso_max_segs) return true; return left <= tcp_max_tso_deferred_mss(tp); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f1bcff0b10..a7b3ec9b6c3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1522,21 +1522,21 @@ static void tcp_cwnd_validate(struct sock *sk) * when we would be allowed to send the split-due-to-Nagle skb fully. */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1765,7 +1765,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + sk->sk_gso_max_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1999,7 +2000,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) -- cgit v1.2.3-70-g09d2 From e3c0d04750751389d5116267f8cf4687444d9a50 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Mon, 30 Jul 2012 21:43:54 +0000 Subject: Fix unexpected SA hard expiration after changing date After SA is setup, one timer is armed to detect soft/hard expiration, however the timer handler uses xtime to do the math. This makes hard expiration occurs first before soft expiration after setting new date with big interval. As a result new child SA is deleted before rekeying the new one. Signed-off-by: Fan Du Signed-off-by: David S. Miller --- include/net/xfrm.h | 4 ++++ net/xfrm/xfrm_state.c | 21 +++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d9509eb29b8..62b619e82a9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -213,6 +213,9 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct tasklet_hrtimer mtimer; + /* used to fix curlft->add_time when changing date */ + long saved_tmo; + /* Last used time */ unsigned long lastused; @@ -238,6 +241,7 @@ static inline struct net *xs_net(struct xfrm_state *x) /* xflags - make enum if more show up */ #define XFRM_TIME_DEFER 1 +#define XFRM_SOFT_EXPIRE 2 enum { XFRM_STATE_VOID, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5b228f97d4b..87cd0e4d428 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -415,8 +415,17 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.hard_add_expires_seconds) { long tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) - goto expired; + if (tmo <= 0) { + if (x->xflags & XFRM_SOFT_EXPIRE) { + /* enter hard expire without soft expire first?! + * setting a new date could trigger this. + * workarbound: fix x->curflt.add_time by below: + */ + x->curlft.add_time = now - x->saved_tmo - 1; + tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; + } else + goto expired; + } if (tmo < next) next = tmo; } @@ -433,10 +442,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me) if (x->lft.soft_add_expires_seconds) { long tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; - if (tmo <= 0) + if (tmo <= 0) { warn = 1; - else if (tmo < next) + x->xflags &= ~XFRM_SOFT_EXPIRE; + } else if (tmo < next) { next = tmo; + x->xflags |= XFRM_SOFT_EXPIRE; + x->saved_tmo = tmo; + } } if (x->lft.soft_use_expires_seconds) { long tmo = x->lft.soft_use_expires_seconds + -- cgit v1.2.3-70-g09d2 From 03f6b0843ad6512f27bc2e48f04c21065311e03e Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 1 Aug 2012 15:58:42 -0500 Subject: cfg80211: add channel flag to prohibit OFDM operation Currently the only way for wireless drivers to tell whether or not OFDM is allowed on the current channel is to check the regulatory information. However, this requires hodling cfg80211_mutex, which is not visible to the drivers. Other regulatory restrictions are provided as flags in the channel definition, so let's do similarly with OFDM. This patch adds a new flag, IEEE80211_CHAN_NO_OFDM, to tell drivers that OFDM on a channel is not allowed. This flag is set on any channels for which regulatory indicates that OFDM is prohibited. Signed-off-by: Seth Forshee Tested-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ net/wireless/reg.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 493fa0c7900..3d254e10ff3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -96,6 +96,7 @@ enum ieee80211_band { * is not permitted. * @IEEE80211_CHAN_NO_HT40MINUS: extension channel below this channel * is not permitted. + * @IEEE80211_CHAN_NO_OFDM: OFDM is not allowed on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -104,6 +105,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_RADAR = 1<<3, IEEE80211_CHAN_NO_HT40PLUS = 1<<4, IEEE80211_CHAN_NO_HT40MINUS = 1<<5, + IEEE80211_CHAN_NO_OFDM = 1<<6, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index a9175fedeb5..cbf30de79c6 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -680,6 +680,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_IBSS; if (rd_flags & NL80211_RRF_DFS) channel_flags |= IEEE80211_CHAN_RADAR; + if (rd_flags & NL80211_RRF_NO_OFDM) + channel_flags |= IEEE80211_CHAN_NO_OFDM; return channel_flags; } -- cgit v1.2.3-70-g09d2 From c263c2c1ad615e935d563cd7be11d417f94895d9 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 23 Jul 2012 18:20:12 +0200 Subject: bcma: BCM43228 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rafał Miłecki Signed-off-by: John W. Linville --- drivers/bcma/host_pci.c | 1 + drivers/bcma/sprom.c | 4 +++- include/linux/bcma/bcma_driver_chipcommon.h | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index 11b32d2642d..a6e5672c67e 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -272,6 +272,7 @@ static DEFINE_PCI_DEVICE_TABLE(bcma_pci_bridge_tbl) = { { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4331) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4353) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4357) }, + { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4359) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4727) }, { 0, }, }; diff --git a/drivers/bcma/sprom.c b/drivers/bcma/sprom.c index 26823d97fd9..9ea4627dc0c 100644 --- a/drivers/bcma/sprom.c +++ b/drivers/bcma/sprom.c @@ -507,7 +507,9 @@ static bool bcma_sprom_onchip_available(struct bcma_bus *bus) /* for these chips OTP is always available */ present = true; break; - + case BCMA_CHIP_ID_BCM43228: + present = chip_status & BCMA_CC_CHIPST_43228_OTP_PRESENT; + break; default: present = false; break; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 3c80885fa82..d323a4b4143 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -89,6 +89,12 @@ #define BCMA_CC_CHIPST_4313_OTP_PRESENT 2 #define BCMA_CC_CHIPST_4331_SPROM_PRESENT 2 #define BCMA_CC_CHIPST_4331_OTP_PRESENT 4 +#define BCMA_CC_CHIPST_43228_ILP_DIV_EN 0x00000001 +#define BCMA_CC_CHIPST_43228_OTP_PRESENT 0x00000002 +#define BCMA_CC_CHIPST_43228_SERDES_REFCLK_PADSEL 0x00000004 +#define BCMA_CC_CHIPST_43228_SDIO_MODE 0x00000008 +#define BCMA_CC_CHIPST_43228_SDIO_OTP_PRESENT 0x00000010 +#define BCMA_CC_CHIPST_43228_SDIO_RESET 0x00000020 #define BCMA_CC_CHIPST_4706_PKG_OPTION BIT(0) /* 0: full-featured package 1: low-cost package */ #define BCMA_CC_CHIPST_4706_SFLASH_PRESENT BIT(1) /* 0: parallel, 1: serial flash is present */ #define BCMA_CC_CHIPST_4706_SFLASH_TYPE BIT(2) /* 0: 8b-p/ST-s flash, 1: 16b-p/Atmal-s flash */ -- cgit v1.2.3-70-g09d2 From f6166384095b7ecf77752b5e9096e6d03d75f7ae Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 2 Aug 2012 15:36:09 +0300 Subject: NFS41: add pg_layout_private to nfs_pageio_descriptor To allow layout driver to pass private information around pg_init/pg_doio. Signed-off-by: Peng Tao Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ include/linux/nfs_page.h | 1 + include/linux/nfs_xdr.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1a6732ed04a..311a79681e2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -49,6 +49,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->io_start = req_offset(hdr->req); hdr->good_bytes = desc->pg_count; hdr->dreq = desc->pg_dreq; + hdr->layout_private = desc->pg_layout_private; hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) @@ -268,6 +269,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_error = 0; desc->pg_lseg = NULL; desc->pg_dreq = NULL; + desc->pg_layout_private = NULL; } EXPORT_SYMBOL_GPL(nfs_pageio_init); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 880805774f9..92ce5783b70 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -69,6 +69,7 @@ struct nfs_pageio_descriptor { const struct nfs_pgio_completion_ops *pg_completion_ops; struct pnfs_layout_segment *pg_lseg; struct nfs_direct_req *pg_dreq; + void *pg_layout_private; }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 00485e08439..ac7c8ae254f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1248,6 +1248,7 @@ struct nfs_pgio_header { void (*release) (struct nfs_pgio_header *hdr); const struct nfs_pgio_completion_ops *completion_ops; struct nfs_direct_req *dreq; + void *layout_private; spinlock_t lock; /* fields protected by lock */ int pnfs_error; -- cgit v1.2.3-70-g09d2 From 095adbb6441172985f5ddc3b9e88cb3191bdeac4 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 31 Jul 2012 17:41:09 +0200 Subject: ACPI: Only count valid srat memory structures Otherwise you could run into: WARN_ON in numa_register_memblks(), because node_possible_map is zero References: https://bugzilla.novell.com/show_bug.cgi?id=757888 On this machine (ProLiant ML570 G3) the SRAT table contains: - No processor affinities - One memory affinity structure (which is set disabled) CC: Per Jessen CC: Andi Kleen Signed-off-by: Thomas Renninger Signed-off-by: Len Brown --- arch/ia64/kernel/acpi.c | 5 +++-- arch/x86/mm/srat.c | 15 ++++++++------- drivers/acpi/numa.c | 8 +++++--- include/linux/acpi.h | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 6f38b6120d9..440578850ae 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -497,7 +497,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) srat_num_cpus++; } -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { unsigned long paddr, size; @@ -512,7 +512,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) /* Ignore disabled entries */ if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) - return; + return -1; /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -531,6 +531,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) p->size = size; p->nid = pxm; num_node_memblks++; + return 0; } void __init acpi_numa_arch_fixup(void) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 4599c3e8bcb..4ddf497ca65 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;} #endif /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; int node, pxm; if (srat_disabled()) - return; + return -1; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { bad_srat(); - return; + return -1; } if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; + return -1; if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; + return -1; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; @@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); bad_srat(); - return; + return -1; } if (numa_add_memblk(node, start, end) < 0) { bad_srat(); - return; + return -1; } node_set(node, numa_nodes_parsed); @@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); + return 0; } void __init acpi_numa_arch_fixup(void) {} diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 2a6399345c8..cb31298ca68 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -237,6 +237,8 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, return 0; } +static int __initdata parsed_numa_memblks; + static int __init acpi_parse_memory_affinity(struct acpi_subtable_header * header, const unsigned long end) @@ -250,8 +252,8 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ - acpi_numa_memory_affinity_init(memory_affinity); - + if (!acpi_numa_memory_affinity_init(memory_affinity)) + parsed_numa_memblks++; return 0; } @@ -306,7 +308,7 @@ int __init acpi_numa_init(void) if (cnt < 0) return cnt; - else if (cnt == 0) + else if (!parsed_numa_memblks) return -ENOENT; return 0; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3ad510b2528..4f2a7622450 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -96,7 +96,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); -void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); +int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); #ifdef CONFIG_ACPI_HOTPLUG_CPU -- cgit v1.2.3-70-g09d2 From 76582d0a68ab6c46987dd678165c26ad81c44cdb Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 25 Jul 2012 16:24:49 +0200 Subject: iommu: Include linux/types.h The linux/iommu.h header uses types defined in linux/types.h but doesn't include it. Signed-off-by: Thierry Reding Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54d6d690073..0a85199191b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -20,6 +20,7 @@ #define __LINUX_IOMMU_H #include +#include #define IOMMU_READ (1) #define IOMMU_WRITE (2) -- cgit v1.2.3-70-g09d2 From ba1eabfade3272596000bf3c62b68ca375e48b08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 3 Aug 2012 15:55:41 +0200 Subject: iommu: Add missing forward declaration in include file The 'struct notifier_block' is not used in linux/iommu.h but not declared anywhere. Add a forward declaration for it. Reported-by: Thierry Reding Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0a85199191b..7e83370e6fd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,7 @@ struct iommu_group; struct bus_type; struct device; struct iommu_domain; +struct notifier_block; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 -- cgit v1.2.3-70-g09d2 From f0cd2dbb6cf387c11f87265462e370bb5469299e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:11:59 +0300 Subject: vfs: kill write_super and sync_supers Finally we can kill the 'sync_supers' kernel thread along with the '->write_super()' superblock operation because all the users are gone. Now every file-system is supposed to self-manage own superblock and its dirty state. The nice thing about killing this thread is that it improves power management. Indeed, 'sync_supers' is a source of monotonic system wake-ups - it woke up every 5 seconds no matter what - even if there were no dirty superblocks and even if there were no file-systems using this service (e.g., btrfs and journalled ext4 do not need it). So it was wasting power most of the time. And because the thread was in the core of the kernel, all systems had to have it. So I am quite happy to make it go away. Interestingly, this thread is a left-over from the pdflush kernel thread which was a self-forking kernel thread responsible for all the write-back in old Linux kernels. It was turned into per-block device BDI threads, and 'sync_supers' was a left-over. Thus, R.I.P, pdflush as well. Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/super.c | 40 ---------------------------------- include/linux/backing-dev.h | 1 - include/linux/fs.h | 3 --- mm/backing-dev.c | 52 --------------------------------------------- mm/page-writeback.c | 1 - 5 files changed, 97 deletions(-) (limited to 'include') diff --git a/fs/super.c b/fs/super.c index b05cf47463d..0902cfa6a12 100644 --- a/fs/super.c +++ b/fs/super.c @@ -536,46 +536,6 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); -/** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. - * - * Note: check the dirty flag before waiting, so we don't - * hold up the sync while mounting a device. (The newly - * mounted device won't need syncing.) - */ -void sync_supers(void) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_op->write_super && sb->s_dirt) { - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN)) - sb->s_op->write_super(sb); - up_read(&sb->s_umount); - - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); -} - /** * iterate_supers - call function for all active superblocks * @f: function to call diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c97c6b9cd38..2a9a9abc912 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -124,7 +124,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_arm_supers_timer(void); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); diff --git a/include/linux/fs.h b/include/linux/fs.h index 38dba16c417..aa110476a95 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1491,7 +1491,6 @@ struct sb_writers { struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_dirt; unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ @@ -1861,7 +1860,6 @@ struct super_operations { int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); - void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_fs) (struct super_block *); int (*unfreeze_fs) (struct super_block *); @@ -2397,7 +2395,6 @@ extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); -extern void sync_supers(void); extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6b4718e2ee3..b41823cc05e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); LIST_HEAD(bdi_pending_list); -static struct task_struct *sync_supers_tsk; -static struct timer_list sync_supers_timer; - -static int bdi_sync_supers(void *); -static void sync_supers_timer_fn(unsigned long); - void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -250,12 +244,6 @@ static int __init default_bdi_init(void) { int err; - sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); - BUG_ON(IS_ERR(sync_supers_tsk)); - - setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - bdi_arm_supers_timer(); - err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -/* - * kupdated() used to do this. We cannot do it from the bdi_forker_thread() - * or we risk deadlocking on ->s_umount. The longer term solution would be - * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback thread individually. - */ -static int bdi_sync_supers(void *unused) -{ - set_user_nice(current, 0); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* - * Do this periodically, like kupdated() did before. - */ - sync_supers(); - } - - return 0; -} - -void bdi_arm_supers_timer(void) -{ - unsigned long next; - - if (!dirty_writeback_interval) - return; - - next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; - mod_timer(&sync_supers_timer, round_jiffies_up(next)); -} - -static void sync_supers_timer_fn(unsigned long unused) -{ - wake_up_process(sync_supers_tsk); - bdi_arm_supers_timer(); -} - static void wakeup_timer_fn(unsigned long data) { struct backing_dev_info *bdi = (struct backing_dev_info *)data; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e5363f34e02..5ad5ce23c1e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1532,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - bdi_arm_supers_timer(); return 0; } -- cgit v1.2.3-70-g09d2 From 0d5c3eba2e1e5aa74e097f49bc90b58f607e101c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:12:08 +0300 Subject: vfs: nuke pdflush from comments The pdflush thread is long gone, so this patch removes references to pdflush from vfs comments. Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/bio.c | 2 +- include/linux/writeback.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/fs/bio.c b/fs/bio.c index 73922abba83..5eaa70c9d96 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1312,7 +1312,7 @@ EXPORT_SYMBOL(bio_copy_kern); * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, pdflush) could clean the pages if they are mapped + * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c66fe3332d8..50c3e8fa06a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -104,7 +104,6 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); } - /* * mm/page-writeback.c */ -- cgit v1.2.3-70-g09d2 From d796c52ef0b71a988364f6109aeb63d79c5b116b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Aug 2012 19:04:57 -0400 Subject: ext4: make sure the journal sb is written in ext4_clear_journal_err() After we transfer set the EXT4_ERROR_FS bit in the file system superblock, it's not enough to call jbd2_journal_clear_err() to clear the error indication from journal superblock --- we need to call jbd2_journal_update_sb_errno() as well. Otherwise, when the root file system is mounted read-only, the journal is replayed, and the error indicator is transferred to the superblock --- but the s_errno field in the jbd2 superblock is left set (since although we cleared it in memory, we never flushed it out to disk). This can end up confusing e2fsck. We should make e2fsck more robust in this case, but the kernel shouldn't be leaving things in this confused state, either. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 1 + fs/jbd2/journal.c | 3 ++- include/linux/jbd2.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d76ec8277d3..ccc4bcad561 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4430,6 +4430,7 @@ static void ext4_clear_journal_err(struct super_block *sb, ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); } } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e9a3c4c8559..bd23f2ebaa6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) * Update a journal's errno. Write updated superblock to disk waiting for IO * to complete. */ -static void jbd2_journal_update_sb_errno(journal_t *journal) +void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; @@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) jbd2_write_superblock(journal, WRITE_SYNC); } +EXPORT_SYMBOL(jbd2_journal_update_sb_errno); /* * Read the superblock for a given journal, performing initial diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f334c7fab96..3efc43f3f16 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1125,6 +1125,7 @@ extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); +extern void jbd2_journal_update_sb_errno(journal_t *); extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); -- cgit v1.2.3-70-g09d2 From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Aug 2012 05:09:33 +0000 Subject: net: ipv6: fix TCP early demux IPv6 needs a cookie in dst_check() call. We need to add rx_dst_cookie and provide a family independent sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/net/inet_connection_sock.h | 1 + include/net/inet_sock.h | 9 --------- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 27 +++++++++++++++++++++++++-- 7 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 379e433e15e..879db26ec40 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -369,6 +369,7 @@ struct ipv6_pinfo { __u8 rcv_tclass; __u32 dst_cookie; + __u32 rx_dst_cookie; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5ee66f517b4..ba1d3615acb 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -39,6 +39,7 @@ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); + void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 83b567fe194..613cfa40167 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,13 +249,4 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } -static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; -} - #endif /* _INET_SOCK_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fd2bc9e3c6..85308b90df8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5392,6 +5392,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + if (unlikely(sk->sk_rx_dst == NULL)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous @@ -5605,7 +5607,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - inet_sk_rx_dst_set(sk, skb); + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a7309..272241f16fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) - inet_sk_rx_dst_set(sk, skb); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 232a90c3ec8..d9c9dcef2de 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,7 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - inet_sk_rx_dst_set(newsk, skb); + newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c66b90f71c9..5a439e9a4c0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1447,7 +1447,17 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) goto reset; if (opt_skb) @@ -1705,9 +1715,9 @@ static void tcp_v6_early_demux(struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; struct inet_sock *icsk = inet_sk(sk); if (dst) - dst = dst_check(dst, 0); + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); if (dst && - icsk->rx_dst_ifindex == inet6_iif(skb)) + icsk->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } @@ -1719,10 +1729,23 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), -- cgit v1.2.3-70-g09d2 From 035534ed3377d9def2c17717899fd64a111a785b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 6 Aug 2012 18:02:29 +0200 Subject: canfd: remove redundant CAN FD flag The first idea of the CAN FD implementation started with a new struct canfd_frame to be used for both CAN FD frames and legacy CAN frames. The now mainlined implementation supports both CAN frame types simultaneously and distinguishes them only by their required sizes: CAN_MTU and CANFD_MTU. Only the struct canfd_frame contains a flags element which is needed for the additional CAN FD information. As CAN FD implicitly means that the 'Extened Data Length' mode is enabled the formerly defined CANFD_EDL bit became redundant and also confusing as an unset bit would be an error and would always need to be tested. This patch removes the obsolete CANFD_EDL bit and clarifies the documentation for the use of struct canfd_frame and the CAN FD relevant flags. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/can.h b/include/linux/can.h index 018055efc03..e52958d7c2d 100644 --- a/include/linux/can.h +++ b/include/linux/can.h @@ -74,20 +74,21 @@ struct can_frame { /* * defined bits for canfd_frame.flags * - * As the default for CAN FD should be to support the high data rate in the - * payload section of the frame (HDR) and to support up to 64 byte in the - * data section (EDL) the bits are only set in the non-default case. - * Btw. as long as there's no real implementation for CAN FD network driver - * these bits are only preliminary. + * The use of struct canfd_frame implies the Extended Data Length (EDL) bit to + * be set in the CAN frame bitstream on the wire. The EDL bit switch turns + * the CAN controllers bitstream processor into the CAN FD mode which creates + * two new options within the CAN FD frame specification: * - * RX: NOHDR/NOEDL - info about received CAN FD frame - * ESI - bit from originating CAN controller - * TX: NOHDR/NOEDL - control per-frame settings if supported by CAN controller - * ESI - bit is set by local CAN controller + * Bit Rate Switch - to indicate a second bitrate is/was used for the payload + * Error State Indicator - represents the error state of the transmitting node + * + * As the CANFD_ESI bit is internally generated by the transmitting CAN + * controller only the CANFD_BRS bit is relevant for real CAN controllers when + * building a CAN FD frame for transmission. Setting the CANFD_ESI bit can make + * sense for virtual CAN interfaces to test applications with echoed frames. */ -#define CANFD_NOHDR 0x01 /* frame without high data rate */ -#define CANFD_NOEDL 0x02 /* frame without extended data length */ -#define CANFD_ESI 0x04 /* error state indicator */ +#define CANFD_BRS 0x01 /* bit rate switch (second bitrate for payload data) */ +#define CANFD_ESI 0x02 /* error state indicator of the transmitting node */ /** * struct canfd_frame - CAN flexible data rate frame structure -- cgit v1.2.3-70-g09d2 From 817fea2df3c24b22f6123dc0106eb063b7132883 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 7 Aug 2012 11:48:33 -0600 Subject: vfio: Include vfio.h in installed headers Signed-off-by: Alex Williamson --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index d9a75447487..fa217607c58 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -391,6 +391,7 @@ header-y += v4l2-dv-timings.h header-y += v4l2-mediabus.h header-y += v4l2-subdev.h header-y += veth.h +header-y += vfio.h header-y += vhost.h header-y += videodev2.h header-y += virtio_9p.h -- cgit v1.2.3-70-g09d2 From 300d3739e873d50d4c6e3656f89007a217fb1d29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Aug 2012 13:50:22 +0200 Subject: Revert "NMI watchdog: fix for lockup detector breakage on resume" Revert commit 45226e9 (NMI watchdog: fix for lockup detector breakage on resume) which breaks resume from system suspend on my SH7372 Mackerel board (by causing a NULL pointer dereference to happen) and is generally wrong, because it abuses the CPU hotplug functionality in a shamelessly blatant way. The original issue should be addressed through appropriate syscore resume callback instead. Signed-off-by: Rafael J. Wysocki --- include/linux/sched.h | 8 -------- kernel/power/suspend.c | 3 --- kernel/watchdog.c | 21 ++------------------- 3 files changed, 2 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c147e7024f1..b8c86648a2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -334,14 +334,6 @@ static inline void lockup_detector_init(void) } #endif -#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) -void lockup_detector_bootcpu_resume(void); -#else -static inline void lockup_detector_bootcpu_resume(void) -{ -} -#endif - #ifdef CONFIG_DETECT_HUNG_TASK extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_check_count; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1da39ea248f..c8b7446b27d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); - /* Kick the lockup detector */ - lockup_detector_bootcpu_resume(); - Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69add8a9da6..4b1dfba70f7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -575,7 +575,7 @@ out: /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -610,27 +610,10 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -#ifdef CONFIG_SUSPEND -/* - * On exit from suspend we force an offline->online transition on the boot CPU - * so that the PMU state that was lost while in suspended state gets set up - * properly for the boot CPU. This information is required for restarting the - * NMI watchdog. - */ -void lockup_detector_bootcpu_resume(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - - cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); -} -#endif - void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- cgit v1.2.3-70-g09d2 From a37e6e344910a43b9ebc2bbf29a029f5ea942598 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Aug 2012 10:55:45 +0000 Subject: net: force dst_default_metrics to const section While investigating on network performance problems, I found this little gem : $ nm -v vmlinux | grep -1 dst_default_metrics ffffffff82736540 b busy.46605 ffffffff82736560 B dst_default_metrics ffffffff82736598 b dst_busy_list Apparently, declaring a const array without initializer put it in (writeable) bss section, in middle of possibly often dirtied cache lines. Since we really want dst_default_metrics be const to avoid any possible false sharing and catch any buggy writes, I force a null initializer. ffffffff818a4c20 R dst_default_metrics Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- net/core/dst.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index baf59789006..621e3513ef5 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -110,7 +110,7 @@ struct dst_entry { }; extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[RTAX_MAX]; +extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL #define __DST_METRICS_PTR(Y) \ diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d2941..56d63612e1e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -149,7 +149,15 @@ int dst_discard(struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard); -const u32 dst_default_metrics[RTAX_MAX]; +const u32 dst_default_metrics[RTAX_MAX + 1] = { + /* This initializer is needed to force linker to place this variable + * into const section. Otherwise it might end into bss section. + * We really want to avoid false sharing on this variable, and catch + * any writes on it. + */ + [RTAX_MAX] = 0xdeadbeef, +}; + void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) -- cgit v1.2.3-70-g09d2 From 59ee93a528b94ef4e81a08db252b0326feff171f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 5 Aug 2012 14:58:37 +0000 Subject: ARM: pxa: remove irq_to_gpio from ezx-pcap driver The irq_to_gpio function was removed from the pxa platform in linux-3.2, and this driver has been broken since. There is actually no in-tree user of this driver that adds this platform device, but the driver can and does get enabled on some platforms. Without this patch, building ezx_defconfig results in: drivers/mfd/ezx-pcap.c: In function 'pcap_isr_work': drivers/mfd/ezx-pcap.c:205:2: error: implicit declaration of function 'irq_to_gpio' [-Werror=implicit-function-declaration] Signed-off-by: Arnd Bergmann Acked-by: Haojian Zhuang Cc: stable@vger.kernel.org (v3.2+) Cc: Samuel Ortiz Cc: Daniel Ribeiro --- drivers/mfd/ezx-pcap.c | 2 +- include/linux/mfd/ezx-pcap.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index 43a76c41cfc..db662e2dcfa 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -202,7 +202,7 @@ static void pcap_isr_work(struct work_struct *work) } local_irq_enable(); ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); - } while (gpio_get_value(irq_to_gpio(pcap->spi->irq))); + } while (gpio_get_value(pdata->gpio)); } static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc) diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h index 40c372165f3..32a1b5cfeba 100644 --- a/include/linux/mfd/ezx-pcap.h +++ b/include/linux/mfd/ezx-pcap.h @@ -16,6 +16,7 @@ struct pcap_subdev { struct pcap_platform_data { unsigned int irq_base; unsigned int config; + int gpio; void (*init) (void *); /* board specific init */ int num_subdevs; struct pcap_subdev *subdevs; -- cgit v1.2.3-70-g09d2 From 4eef6cbfcc03b294d9d334368a851b35b496ce53 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 30 Apr 2012 16:21:37 +0000 Subject: Input: eeti_ts: pass gpio value instead of IRQ The EETI touchscreen asserts its IRQ line as soon as it has data in its internal buffers. The line is automatically deasserted once all data has been read via I2C. Hence, the driver has to monitor the GPIO line and cannot simply rely on the interrupt handler reception. In the current implementation of the driver, irq_to_gpio() is used to determine the GPIO number from the i2c_client's IRQ value. As irq_to_gpio() is not available on all platforms, this patch changes this and makes the driver ignore the passed in IRQ. Instead, a GPIO is added to the platform_data struct and gpio_to_irq is used to derive the IRQ from that GPIO. If this fails, bail out. The driver is only able to work in environments where the touchscreen GPIO can be mapped to an IRQ. Without this patch, building raumfeld_defconfig results in: drivers/input/touchscreen/eeti_ts.c: In function 'eeti_ts_irq_active': drivers/input/touchscreen/eeti_ts.c:65:2: error: implicit declaration of function 'irq_to_gpio' [-Werror=implicit-function-declaration] Signed-off-by: Daniel Mack Signed-off-by: Arnd Bergmann Cc: stable@vger.kernel.org (v3.2+) Cc: Dmitry Torokhov Cc: Sven Neumann Cc: linux-input@vger.kernel.org Cc: Haojian Zhuang --- arch/arm/mach-pxa/raumfeld.c | 2 +- drivers/input/touchscreen/eeti_ts.c | 21 +++++++++++++-------- include/linux/input/eeti_ts.h | 1 + 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c index 5905ed130e9..d89d87ae144 100644 --- a/arch/arm/mach-pxa/raumfeld.c +++ b/arch/arm/mach-pxa/raumfeld.c @@ -953,12 +953,12 @@ static struct i2c_board_info raumfeld_connector_i2c_board_info __initdata = { static struct eeti_ts_platform_data eeti_ts_pdata = { .irq_active_high = 1, + .irq_gpio = GPIO_TOUCH_IRQ, }; static struct i2c_board_info raumfeld_controller_i2c_board_info __initdata = { .type = "eeti_ts", .addr = 0x0a, - .irq = PXA_GPIO_TO_IRQ(GPIO_TOUCH_IRQ), .platform_data = &eeti_ts_pdata, }; diff --git a/drivers/input/touchscreen/eeti_ts.c b/drivers/input/touchscreen/eeti_ts.c index 503c7096ed3..908407efc67 100644 --- a/drivers/input/touchscreen/eeti_ts.c +++ b/drivers/input/touchscreen/eeti_ts.c @@ -48,7 +48,7 @@ struct eeti_ts_priv { struct input_dev *input; struct work_struct work; struct mutex mutex; - int irq, irq_active_high; + int irq_gpio, irq, irq_active_high; }; #define EETI_TS_BITDEPTH (11) @@ -62,7 +62,7 @@ struct eeti_ts_priv { static inline int eeti_ts_irq_active(struct eeti_ts_priv *priv) { - return gpio_get_value(irq_to_gpio(priv->irq)) == priv->irq_active_high; + return gpio_get_value(priv->irq_gpio) == priv->irq_active_high; } static void eeti_ts_read(struct work_struct *work) @@ -157,7 +157,7 @@ static void eeti_ts_close(struct input_dev *dev) static int __devinit eeti_ts_probe(struct i2c_client *client, const struct i2c_device_id *idp) { - struct eeti_ts_platform_data *pdata; + struct eeti_ts_platform_data *pdata = client->dev.platform_data; struct eeti_ts_priv *priv; struct input_dev *input; unsigned int irq_flags; @@ -199,9 +199,12 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, priv->client = client; priv->input = input; - priv->irq = client->irq; + priv->irq_gpio = pdata->irq_gpio; + priv->irq = gpio_to_irq(pdata->irq_gpio); - pdata = client->dev.platform_data; + err = gpio_request_one(pdata->irq_gpio, GPIOF_IN, client->name); + if (err < 0) + goto err1; if (pdata) priv->irq_active_high = pdata->irq_active_high; @@ -215,13 +218,13 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, err = input_register_device(input); if (err) - goto err1; + goto err2; err = request_irq(priv->irq, eeti_ts_isr, irq_flags, client->name, priv); if (err) { dev_err(&client->dev, "Unable to request touchscreen IRQ.\n"); - goto err2; + goto err3; } /* @@ -233,9 +236,11 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, device_init_wakeup(&client->dev, 0); return 0; -err2: +err3: input_unregister_device(input); input = NULL; /* so we dont try to free it below */ +err2: + gpio_free(pdata->irq_gpio); err1: input_free_device(input); kfree(priv); diff --git a/include/linux/input/eeti_ts.h b/include/linux/input/eeti_ts.h index f875b316249..16625d799b6 100644 --- a/include/linux/input/eeti_ts.h +++ b/include/linux/input/eeti_ts.h @@ -2,6 +2,7 @@ #define LINUX_INPUT_EETI_TS_H struct eeti_ts_platform_data { + int irq_gpio; unsigned int irq_active_high; }; -- cgit v1.2.3-70-g09d2 From 63d02d157ec4124990258d66517b6c11fd6df0cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Aug 2012 14:11:00 +0000 Subject: net: tcp: ipv6_mapped needs sk_rx_dst_set method commit 5d299f3d3c8a2fb (net: ipv6: fix TCP early demux) added a regression for ipv6_mapped case. [ 67.422369] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 67.449678] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 92.631060] BUG: unable to handle kernel NULL pointer dereference at (null) [ 92.631435] IP: [< (null)>] (null) [ 92.631645] PGD 0 [ 92.631846] Oops: 0010 [#1] SMP [ 92.632095] Modules linked in: autofs4 sunrpc ipv6 dm_mirror dm_region_hash dm_log dm_multipath dm_mod video sbs sbshc battery ac lp parport sg snd_hda_intel snd_hda_codec snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device pcspkr snd_pcm_oss snd_mixer_oss snd_pcm snd_timer serio_raw button floppy snd i2c_i801 i2c_core soundcore snd_page_alloc shpchp ide_cd_mod cdrom microcode ehci_hcd ohci_hcd uhci_hcd [ 92.634294] CPU 0 [ 92.634294] Pid: 4469, comm: sendmail Not tainted 3.6.0-rc1 #3 [ 92.634294] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 92.634294] RSP: 0018:ffff880245fc7cb0 EFLAGS: 00010282 [ 92.634294] RAX: ffffffffa01985f0 RBX: ffff88024827ad00 RCX: 0000000000000000 [ 92.634294] RDX: 0000000000000218 RSI: ffff880254735380 RDI: ffff88024827ad00 [ 92.634294] RBP: ffff880245fc7cc8 R08: 0000000000000001 R09: 0000000000000000 [ 92.634294] R10: 0000000000000000 R11: ffff880245fc7bf8 R12: ffff880254735380 [ 92.634294] R13: ffff880254735380 R14: 0000000000000000 R15: 7fffffffffff0218 [ 92.634294] FS: 00007f4516ccd6f0(0000) GS:ffff880256600000(0000) knlGS:0000000000000000 [ 92.634294] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 92.634294] CR2: 0000000000000000 CR3: 0000000245ed1000 CR4: 00000000000007f0 [ 92.634294] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.634294] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 92.634294] Process sendmail (pid: 4469, threadinfo ffff880245fc6000, task ffff880254b8cac0) [ 92.634294] Stack: [ 92.634294] ffffffff813837a7 ffff88024827ad00 ffff880254b6b0e8 ffff880245fc7d68 [ 92.634294] ffffffff81385083 00000000001d2680 ffff8802547353a8 ffff880245fc7d18 [ 92.634294] ffffffff8105903a ffff88024827ad60 0000000000000002 00000000000000ff [ 92.634294] Call Trace: [ 92.634294] [] ? tcp_finish_connect+0x2c/0xfa [ 92.634294] [] tcp_rcv_state_process+0x2b6/0x9c6 [ 92.634294] [] ? sched_clock_cpu+0xc3/0xd1 [ 92.634294] [] ? local_clock+0x2b/0x3c [ 92.634294] [] tcp_v4_do_rcv+0x63a/0x670 [ 92.634294] [] release_sock+0x128/0x1bd [ 92.634294] [] __inet_stream_connect+0x1b1/0x352 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? wake_up_bit+0x25/0x25 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? inet_stream_connect+0x22/0x4b [ 92.634294] [] inet_stream_connect+0x33/0x4b [ 92.634294] [] sys_connect+0x78/0x9e [ 92.634294] [] ? sysret_check+0x1b/0x56 [ 92.634294] [] ? __audit_syscall_entry+0x195/0x1c8 [ 92.634294] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 92.634294] [] system_call_fastpath+0x16/0x1b [ 92.634294] Code: Bad RIP value. [ 92.634294] RIP [< (null)>] (null) [ 92.634294] RSP [ 92.634294] CR2: 0000000000000000 [ 92.648982] ---[ end trace 24e2bed94314c8d9 ]--- [ 92.649146] Kernel panic - not syncing: Fatal exception in interrupt Fix this using inet_sk_rx_dst_set(), and export this function in case IPv6 is modular. Reported-by: Andrew Morton Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e19124b84cd..1f000ffe707 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -464,6 +464,7 @@ extern int tcp_disconnect(struct sock *sk, int flags); void tcp_connect_init(struct sock *sk); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 272241f16fc..76782376401 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,7 +1869,7 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; -static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -1877,6 +1877,7 @@ static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } +EXPORT_SYMBOL(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5a439e9a4c0..bb9ce2b2f37 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1777,6 +1777,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), -- cgit v1.2.3-70-g09d2 From 02b69cbdc2fb2e1bfbfd9ac0c246d7be1b08d3cd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:46 +0000 Subject: netfilter: nf_ct_sip: fix IPv6 address parsing Within SIP messages IPv6 addresses are enclosed in square brackets in most cases, with the exception of the "received=" header parameter. Currently the helper fails to parse enclosed addresses. This patch: - changes the SIP address parsing function to enforce square brackets when required, and accept them when not required but present, as recommended by RFC 5118. - adds a new SDP address parsing function that never accepts square brackets since SDP doesn't use them. With these changes, the SIP helper correctly parses all test messages from RFC 5118 (Session Initiation Protocol (SIP) Torture Test Messages for Internet Protocol Version 6 (IPv6)). Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sip.h | 2 +- net/ipv4/netfilter/nf_nat_sip.c | 4 +- net/netfilter/nf_conntrack_sip.c | 87 ++++++++++++++++++++++++------ 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h index 0dfc8b7210a..89f2a627f3f 100644 --- a/include/linux/netfilter/nf_conntrack_sip.h +++ b/include/linux/netfilter/nf_conntrack_sip.h @@ -164,7 +164,7 @@ extern int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr); + union nf_inet_addr *addr, bool delim); extern int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, unsigned int off, unsigned int datalen, const char *name, diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ea4a23813d2..eef8f29e8bf 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -173,7 +173,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * the reply. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "maddr=", &poff, &plen, - &addr) > 0 && + &addr, true) > 0 && addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -187,7 +187,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * from which the server received the request. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "received=", &poff, &plen, - &addr) > 0 && + &addr, false) > 0 && addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2fb666920cc..5c0a112aeee 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, */ static const struct sip_header ct_sdp_hdrs[] = { [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; @@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } -- cgit v1.2.3-70-g09d2 From 9d8dad742ad1c74d7e7210ee05d0b44961d5ea16 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 9 Aug 2012 19:01:26 -0700 Subject: Yama: higher restrictions should block PTRACE_TRACEME The higher ptrace restriction levels should be blocking even PTRACE_TRACEME requests. The comments in the LSM documentation are misleading about when the checks happen (the parent does not go through security_ptrace_access_check() on a PTRACE_TRACEME call). Signed-off-by: Kees Cook Cc: stable@vger.kernel.org # 3.5.x and later Signed-off-by: James Morris --- Documentation/security/Yama.txt | 14 +++++++------- include/linux/security.h | 2 -- security/yama/yama_lsm.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt index e369de2d48c..dd908cf64ec 100644 --- a/Documentation/security/Yama.txt +++ b/Documentation/security/Yama.txt @@ -46,14 +46,13 @@ restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...) so that any otherwise allowed process (even those in external pid namespaces) may attach. -These restrictions do not change how ptrace via PTRACE_TRACEME operates. - -The sysctl settings are: +The sysctl settings (writable only with CAP_SYS_PTRACE) are: 0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other process running under the same uid, as long as it is dumpable (i.e. did not transition uids, start privileged, or have called - prctl(PR_SET_DUMPABLE...) already). + prctl(PR_SET_DUMPABLE...) already). Similarly, PTRACE_TRACEME is + unchanged. 1 - restricted ptrace: a process must have a predefined relationship with the inferior it wants to call PTRACE_ATTACH on. By default, @@ -61,12 +60,13 @@ The sysctl settings are: classic criteria is also met. To change the relationship, an inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare an allowed debugger PID to call PTRACE_ATTACH on the inferior. + Using PTRACE_TRACEME is unchanged. 2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace - with PTRACE_ATTACH. + with PTRACE_ATTACH, or through children calling PTRACE_TRACEME. -3 - no attach: no processes may use ptrace with PTRACE_ATTACH. Once set, - this sysctl cannot be changed to a lower value. +3 - no attach: no processes may use ptrace with PTRACE_ATTACH nor via + PTRACE_TRACEME. Once set, this sysctl value cannot be changed. The original children-only logic was based on the restrictions in grsecurity. diff --git a/include/linux/security.h b/include/linux/security.h index 4e5a73cdbbe..3dea6a9d568 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1242,8 +1242,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself * to the @parent process for tracing. - * The parent process will still have to undergo the ptrace_access_check - * checks before it is allowed to trace this one. * @parent contains the task_struct structure for debugger process. * Return 0 if permission is granted. * @capget: diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 83554ee8a58..d51b7c76c37 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -290,10 +290,51 @@ static int yama_ptrace_access_check(struct task_struct *child, return rc; } +/** + * yama_ptrace_traceme - validate PTRACE_TRACEME calls + * @parent: task that will become the ptracer of the current task + * + * Returns 0 if following the ptrace is allowed, -ve on error. + */ +static int yama_ptrace_traceme(struct task_struct *parent) +{ + int rc; + + /* If standard caps disallows it, so does Yama. We should + * only tighten restrictions further. + */ + rc = cap_ptrace_traceme(parent); + if (rc) + return rc; + + /* Only disallow PTRACE_TRACEME on more aggressive settings. */ + switch (ptrace_scope) { + case YAMA_SCOPE_CAPABILITY: + if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) + rc = -EPERM; + break; + case YAMA_SCOPE_NO_ATTACH: + rc = -EPERM; + break; + } + + if (rc) { + char name[sizeof(current->comm)]; + printk_ratelimited(KERN_NOTICE + "ptraceme of pid %d was attempted by: %s (pid %d)\n", + current->pid, + get_task_comm(name, parent), + parent->pid); + } + + return rc; +} + static struct security_operations yama_ops = { .name = "yama", .ptrace_access_check = yama_ptrace_access_check, + .ptrace_traceme = yama_ptrace_traceme, .task_prctl = yama_task_prctl, .task_free = yama_task_free, }; -- cgit v1.2.3-70-g09d2 From b5ec8eeac46a99004c26791f70b15d001e970acf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Aug 2012 02:22:47 +0000 Subject: ipv4: fix ip_send_skb() ip_send_skb() can send orphaned skb, so we must pass the net pointer to avoid possible NULL dereference in error path. Bug added by commit 3a7c384ffd57 (ipv4: tcp: unicast_sock should not land outside of TCP stack) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/ip_output.c | 5 ++--- net/ipv4/udp.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index bd5e444a19c..5a5d84d3d2c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -120,7 +120,7 @@ extern struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); -extern int ip_send_skb(struct sk_buff *skb); +extern int ip_send_skb(struct net *net, struct sk_buff *skb); extern int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4); extern void ip_flush_pending_frames(struct sock *sk); extern struct sk_buff *ip_make_skb(struct sock *sk, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ec410e08b4b..147ccc3e93d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1366,9 +1366,8 @@ out: return skb; } -int ip_send_skb(struct sk_buff *skb) +int ip_send_skb(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); int err; err = ip_local_out(skb); @@ -1391,7 +1390,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) return 0; /* Netfilter gets whole the not fragmented skb. */ - return ip_send_skb(skb); + return ip_send_skb(sock_net(sk), skb); } /* diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b4c3582a991..6f6d1aca3c3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -758,7 +758,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->check = CSUM_MANGLED_0; send: - err = ip_send_skb(skb); + err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), -- cgit v1.2.3-70-g09d2 From 2359a47671fc4fb0fe5e9945f76c2cb10792c0f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 20:52:21 +0000 Subject: codel: refine one condition to avoid a nul rec_inv_sqrt One condition before codel_Newton_step() was not good if we never left the dropping state for a flow. As a result rec_inv_sqrt was 0, instead of the ~0 initial value. codel control law was then set to a very aggressive mode, dropping many packets before reaching 'target' and recovering from this problem. To keep codel_vars_init() as efficient as possible, refine the condition to make sure rec_inv_sqrt initial value is correct Many thanks to Anton Mich for discovering the issue and suggesting a fix. Reported-by: Anton Mich Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/codel.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/codel.h b/include/net/codel.h index 550debfc240..389cf621161 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -305,6 +305,8 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, } } } else if (drop) { + u32 delta; + if (params->ecn && INET_ECN_set_ce(skb)) { stats->ecn_mark++; } else { @@ -320,9 +322,11 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, * assume that the drop rate that controlled the queue on the * last cycle is a good starting point to control it now. */ - if (codel_time_before(now - vars->drop_next, + delta = vars->count - vars->lastcount; + if (delta > 1 && + codel_time_before(now - vars->drop_next, 16 * params->interval)) { - vars->count = (vars->count - vars->lastcount) | 1; + vars->count = delta; /* we dont care if rec_inv_sqrt approximation * is not very precise : * Next Newton steps will correct it quadratically. -- cgit v1.2.3-70-g09d2 From 2f292004dd1fb005788dc0a9cdd5559812ed866e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 6 Aug 2012 10:03:59 -0400 Subject: drm/radeon: add some new SI pci ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- include/drm/drm_pciids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 7ff5c99b163..c78bb997e2c 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -213,9 +213,12 @@ {0x1002, 0x6800, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6801, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6802, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6806, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6808, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6809, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6810, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6816, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6817, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6818, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6819, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PITCAIRN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6820, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ -- cgit v1.2.3-70-g09d2 From 6759a0a7a0496dbbd4fb062c6a76d61c55d0fbd9 Mon Sep 17 00:00:00 2001 From: Marek Olšák Date: Thu, 9 Aug 2012 16:34:17 +0200 Subject: drm/radeon/kms: implement timestamp userspace query (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Returns a snapshot of the GPU clock counter. Needed for certain OpenGL extensions. v2: agd5f - address Jerome's comments - add function documentation Signed-off-by: Marek Olšák Reviewed-by: Jerome Glisse Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/r600.c | 20 +++++++++++++++++++ drivers/gpu/drm/radeon/r600d.h | 3 +++ drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_asic.h | 2 ++ drivers/gpu/drm/radeon/radeon_device.c | 1 + drivers/gpu/drm/radeon/radeon_drv.c | 3 ++- drivers/gpu/drm/radeon/radeon_kms.c | 35 ++++++++++++++++++++++++++++------ drivers/gpu/drm/radeon/si.c | 19 ++++++++++++++++++ drivers/gpu/drm/radeon/sid.h | 3 +++ include/drm/radeon_drm.h | 2 ++ 10 files changed, 82 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index 637280f541a..d79c639ae73 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -3789,3 +3789,23 @@ static void r600_pcie_gen2_enable(struct radeon_device *rdev) WREG32_PCIE_P(PCIE_LC_LINK_WIDTH_CNTL, link_width_cntl); } } + +/** + * r600_get_gpu_clock - return GPU clock counter snapshot + * + * @rdev: radeon_device pointer + * + * Fetches a GPU clock counter snapshot (R6xx-cayman). + * Returns the 64 bit clock counter snapshot. + */ +uint64_t r600_get_gpu_clock(struct radeon_device *rdev) +{ + uint64_t clock; + + mutex_lock(&rdev->gpu_clock_mutex); + WREG32(RLC_CAPTURE_GPU_CLOCK_COUNT, 1); + clock = (uint64_t)RREG32(RLC_GPU_CLOCK_COUNT_LSB) | + ((uint64_t)RREG32(RLC_GPU_CLOCK_COUNT_MSB) << 32ULL); + mutex_unlock(&rdev->gpu_clock_mutex); + return clock; +} diff --git a/drivers/gpu/drm/radeon/r600d.h b/drivers/gpu/drm/radeon/r600d.h index 4b116ae75fc..fd328f4c3ea 100644 --- a/drivers/gpu/drm/radeon/r600d.h +++ b/drivers/gpu/drm/radeon/r600d.h @@ -602,6 +602,9 @@ #define RLC_HB_WPTR 0x3f1c #define RLC_HB_WPTR_LSB_ADDR 0x3f14 #define RLC_HB_WPTR_MSB_ADDR 0x3f18 +#define RLC_GPU_CLOCK_COUNT_LSB 0x3f38 +#define RLC_GPU_CLOCK_COUNT_MSB 0x3f3c +#define RLC_CAPTURE_GPU_CLOCK_COUNT 0x3f40 #define RLC_MC_CNTL 0x3f44 #define RLC_UCODE_CNTL 0x3f48 #define RLC_UCODE_ADDR 0x3f2c diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index b237a29142d..99304194a65 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1534,6 +1534,7 @@ struct radeon_device { unsigned debugfs_count; /* virtual memory */ struct radeon_vm_manager vm_manager; + struct mutex gpu_clock_mutex; }; int radeon_device_init(struct radeon_device *rdev, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index 0d445e7d00d..18c38d14c8c 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -368,6 +368,7 @@ void r600_kms_blit_copy(struct radeon_device *rdev, unsigned num_gpu_pages, struct radeon_sa_bo *vb); int r600_mc_wait_for_idle(struct radeon_device *rdev); +uint64_t r600_get_gpu_clock(struct radeon_device *rdev); /* * rv770,rv730,rv710,rv740 @@ -468,5 +469,6 @@ int si_vm_bind(struct radeon_device *rdev, struct radeon_vm *vm, int id); void si_vm_unbind(struct radeon_device *rdev, struct radeon_vm *vm); void si_vm_tlb_flush(struct radeon_device *rdev, struct radeon_vm *vm); int si_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib); +uint64_t si_get_gpu_clock(struct radeon_device *rdev); #endif diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 742af8244e8..d2e243867ac 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1009,6 +1009,7 @@ int radeon_device_init(struct radeon_device *rdev, atomic_set(&rdev->ih.lock, 0); mutex_init(&rdev->gem.mutex); mutex_init(&rdev->pm.mutex); + mutex_init(&rdev->gpu_clock_mutex); init_rwsem(&rdev->pm.mclk_lock); init_rwsem(&rdev->exclusive_lock); init_waitqueue_head(&rdev->irq.vblank_queue); diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index a7f8ac0d8f0..d7269f48d37 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -61,9 +61,10 @@ * 2.17.0 - add STRMOUT_BASE_UPDATE for r7xx * 2.18.0 - r600-eg: allow "invalid" DB formats * 2.19.0 - r600-eg: MSAA textures + * 2.20.0 - r600-si: RADEON_INFO_TIMESTAMP query */ #define KMS_DRIVER_MAJOR 2 -#define KMS_DRIVER_MINOR 19 +#define KMS_DRIVER_MINOR 20 #define KMS_DRIVER_PATCHLEVEL 0 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags); int radeon_driver_unload_kms(struct drm_device *dev); diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 1d73f16b5d9..414b4acf694 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -29,6 +29,7 @@ #include "drm_sarea.h" #include "radeon.h" #include "radeon_drm.h" +#include "radeon_asic.h" #include #include @@ -167,17 +168,39 @@ static void radeon_set_filp_rights(struct drm_device *dev, int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct radeon_device *rdev = dev->dev_private; - struct drm_radeon_info *info; + struct drm_radeon_info *info = data; struct radeon_mode_info *minfo = &rdev->mode_info; - uint32_t *value_ptr; - uint32_t value; + uint32_t value, *value_ptr; + uint64_t value64, *value_ptr64; struct drm_crtc *crtc; int i, found; - info = data; + /* TIMESTAMP is a 64-bit value, needs special handling. */ + if (info->request == RADEON_INFO_TIMESTAMP) { + if (rdev->family >= CHIP_R600) { + value_ptr64 = (uint64_t*)((unsigned long)info->value); + if (rdev->family >= CHIP_TAHITI) { + value64 = si_get_gpu_clock(rdev); + } else { + value64 = r600_get_gpu_clock(rdev); + } + + if (DRM_COPY_TO_USER(value_ptr64, &value64, sizeof(value64))) { + DRM_ERROR("copy_to_user %s:%u\n", __func__, __LINE__); + return -EFAULT; + } + return 0; + } else { + DRM_DEBUG_KMS("timestamp is r6xx+ only!\n"); + return -EINVAL; + } + } + value_ptr = (uint32_t *)((unsigned long)info->value); - if (DRM_COPY_FROM_USER(&value, value_ptr, sizeof(value))) + if (DRM_COPY_FROM_USER(&value, value_ptr, sizeof(value))) { + DRM_ERROR("copy_from_user %s:%u\n", __func__, __LINE__); return -EFAULT; + } switch (info->request) { case RADEON_INFO_DEVICE_ID: @@ -337,7 +360,7 @@ int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return -EINVAL; } if (DRM_COPY_TO_USER(value_ptr, &value, sizeof(uint32_t))) { - DRM_ERROR("copy_to_user\n"); + DRM_ERROR("copy_to_user %s:%u\n", __func__, __LINE__); return -EFAULT; } return 0; diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index c153a7f359c..0139e227e3c 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -3968,3 +3968,22 @@ void si_fini(struct radeon_device *rdev) rdev->bios = NULL; } +/** + * si_get_gpu_clock - return GPU clock counter snapshot + * + * @rdev: radeon_device pointer + * + * Fetches a GPU clock counter snapshot (SI). + * Returns the 64 bit clock counter snapshot. + */ +uint64_t si_get_gpu_clock(struct radeon_device *rdev) +{ + uint64_t clock; + + mutex_lock(&rdev->gpu_clock_mutex); + WREG32(RLC_CAPTURE_GPU_CLOCK_COUNT, 1); + clock = (uint64_t)RREG32(RLC_GPU_CLOCK_COUNT_LSB) | + ((uint64_t)RREG32(RLC_GPU_CLOCK_COUNT_MSB) << 32ULL); + mutex_unlock(&rdev->gpu_clock_mutex); + return clock; +} diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 7869089e876..ef4815c27b1 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -698,6 +698,9 @@ #define RLC_UCODE_ADDR 0xC32C #define RLC_UCODE_DATA 0xC330 +#define RLC_GPU_CLOCK_COUNT_LSB 0xC338 +#define RLC_GPU_CLOCK_COUNT_MSB 0xC33C +#define RLC_CAPTURE_GPU_CLOCK_COUNT 0xC340 #define RLC_MC_CNTL 0xC344 #define RLC_UCODE_CNTL 0xC348 diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h index 58056865b8e..dc3a8cd7db8 100644 --- a/include/drm/radeon_drm.h +++ b/include/drm/radeon_drm.h @@ -964,6 +964,8 @@ struct drm_radeon_cs { #define RADEON_INFO_IB_VM_MAX_SIZE 0x0f /* max pipes - needed for compute shaders */ #define RADEON_INFO_MAX_PIPES 0x10 +/* timestamp for GL_ARB_timer_query (OpenGL), returns the current GPU clock */ +#define RADEON_INFO_TIMESTAMP 0x11 struct drm_radeon_info { uint32_t request; -- cgit v1.2.3-70-g09d2 From 0bce9c46bf3b15f485d82d7e81dabed6ebcc24b1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Aug 2012 15:22:09 +0100 Subject: mutex: Place lock in contended state after fastpath_lock failure ARM recently moved to asm-generic/mutex-xchg.h for its mutex implementation after the previous implementation was found to be missing some crucial memory barriers. However, this has revealed some problems running hackbench on SMP platforms due to the way in which the MUTEX_SPIN_ON_OWNER code operates. The symptoms are that a bunch of hackbench tasks are left waiting on an unlocked mutex and therefore never get woken up to claim it. This boils down to the following sequence of events: Task A Task B Task C Lock value 0 1 1 lock() 0 2 lock() 0 3 spin(A) 0 4 unlock() 1 5 lock() 0 6 cmpxchg(1,0) 0 7 contended() -1 8 lock() 0 9 spin(C) 0 10 unlock() 1 11 cmpxchg(1,0) 0 12 unlock() 1 At this point, the lock is unlocked, but Task B is in an uninterruptible sleep with nobody to wake it up. This patch fixes the problem by ensuring we put the lock into the contended state if we fail to acquire it on the fastpath, ensuring that any blocked waiters are woken up when the mutex is released. Signed-off-by: Will Deacon Cc: Arnd Bergmann Cc: Chris Mason Cc: Ingo Molnar Cc: Reviewed-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-6e9lrw2avczr0617fzl5vqb8@git.kernel.org Signed-off-by: Thomas Gleixner --- include/asm-generic/mutex-xchg.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h index 580a6d35c70..c04e0db8a2d 100644 --- a/include/asm-generic/mutex-xchg.h +++ b/include/asm-generic/mutex-xchg.h @@ -26,7 +26,13 @@ static inline void __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) { if (unlikely(atomic_xchg(count, 0) != 1)) - fail_fn(count); + /* + * We failed to acquire the lock, so mark it contended + * to ensure that any waiting tasks are woken up by the + * unlock slow path. + */ + if (likely(atomic_xchg(count, -1) != 1)) + fail_fn(count); } /** @@ -43,7 +49,8 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) { if (unlikely(atomic_xchg(count, 0) != 1)) - return fail_fn(count); + if (likely(atomic_xchg(count, -1) != 1)) + return fail_fn(count); return 0; } -- cgit v1.2.3-70-g09d2 From f026cfa82f628db24b8cea41b9d6202af104cecb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Aug 2012 09:53:38 -0700 Subject: Revert "x86-64/efi: Use EFI to deal with platform wall clock" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bacef661acdb634170a8faddbc1cf28e8f8b9eee. This commit has been found to cause serious regressions on a number of ASUS machines at the least. We probably need to provide a 1:1 map in addition to the EFI virtual memory map in order for this to work. Signed-off-by: H. Peter Anvin Reported-and-bisected-by: Jérôme Carretero Cc: Jan Beulich Cc: Matt Fleming Cc: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120805172903.5f8bb24c@zougloub.eu --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/platform/efi/efi.c | 30 ++++++++++++++++++++++++++---- include/linux/efi.h | 2 ++ init/main.c | 8 ++++---- 4 files changed, 36 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a9616..a718e0d2350 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2dc29f51e75..92660edaa1e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static int efi_set_rtc_mmss(unsigned long nowtime) +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); + efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -static unsigned long efi_get_time(void) +unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -606,13 +621,18 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * EFI runtime service before set_virtual_address_map + * two EFI runtime services before set_virtual_address_map * is invoked. */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -700,10 +720,12 @@ void __init efi_init(void) efi_enabled = 0; return; } +#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } +#endif #if EFI_DEBUG print_efi_memmap(); diff --git a/include/linux/efi.h b/include/linux/efi.h index 103adc6d7e3..ec45ccd8708 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,8 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); +extern unsigned long efi_get_time(void); +extern int efi_set_rtc_mmss(unsigned long nowtime); extern void efi_reserve_boot_services(void); extern struct efi_memory_map memmap; diff --git a/init/main.c b/init/main.c index e60679de61c..b28673087ac 100644 --- a/init/main.c +++ b/init/main.c @@ -461,10 +461,6 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); -#ifdef CONFIG_X86 - if (efi_enabled) - efi_enter_virtual_mode(); -#endif } asmlinkage void __init start_kernel(void) @@ -606,6 +602,10 @@ asmlinkage void __init start_kernel(void) calibrate_delay(); pidmap_init(); anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); -- cgit v1.2.3-70-g09d2 From 47be03a28cc6c80e3aa2b3e8ed6d960ff0c5c0af Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:37 +0000 Subject: netpoll: use GFP_ATOMIC in slave_enable_netpoll() and __netpoll_setup() slave_enable_netpoll() and __netpoll_setup() may be called with read_lock() held, so should use GFP_ATOMIC to allocate memory. Eric suggested to pass gfp flags to __netpoll_setup(). Cc: Eric Dumazet Cc: "David S. Miller" Reported-by: Dan Carpenter Signed-off-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 6 +++--- drivers/net/team/team.c | 16 +++++++++------- include/linux/netdevice.h | 3 ++- include/linux/netpoll.h | 2 +- net/8021q/vlan_dev.c | 7 ++++--- net/bridge/br_device.c | 12 ++++++------ net/bridge/br_if.c | 2 +- net/bridge/br_private.h | 4 ++-- net/core/netpoll.c | 8 ++++---- 9 files changed, 32 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6fae5f3ec7f..8697136e27c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1235,12 +1235,12 @@ static inline int slave_enable_netpoll(struct slave *slave) struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), GFP_ATOMIC); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, slave->dev); + err = __netpoll_setup(np, slave->dev, GFP_ATOMIC); if (err) { kfree(np); goto out; @@ -1292,7 +1292,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev) read_unlock(&bond->lock); } -static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { struct bonding *bond = netdev_priv(dev); struct slave *slave; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 87707ab3943..341b65dbbcd 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -795,16 +795,17 @@ static void team_port_leave(struct team *team, struct team_port *port) } #ifdef CONFIG_NET_POLL_CONTROLLER -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { struct netpoll *np; int err; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), gfp); if (!np) return -ENOMEM; - err = __netpoll_setup(np, port->dev); + err = __netpoll_setup(np, port->dev, gfp); if (err) { kfree(np); return err; @@ -833,7 +834,8 @@ static struct netpoll_info *team_netpoll_info(struct team *team) } #else -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { return 0; } @@ -913,7 +915,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev) } if (team_netpoll_info(team)) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, GFP_KERNEL); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); @@ -1443,7 +1445,7 @@ static void team_netpoll_cleanup(struct net_device *dev) } static int team_netpoll_setup(struct net_device *dev, - struct netpoll_info *npifo) + struct netpoll_info *npifo, gfp_t gfp) { struct team *team = netdev_priv(dev); struct team_port *port; @@ -1451,7 +1453,7 @@ static int team_netpoll_setup(struct net_device *dev, mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, gfp); if (err) { __team_netpoll_cleanup(team); break; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9db4f33407..3560d688161 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -953,7 +953,8 @@ struct net_device_ops { #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info *info); + struct netpoll_info *info, + gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 28f5389c924..bf2d51eec0f 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -43,7 +43,7 @@ struct netpoll_info { void netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev); +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp); int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 73a2a83ee2d..ee4ae0944ce 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -669,19 +669,20 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, + gfp_t gfp) { struct vlan_dev_priv *info = vlan_dev_priv(dev); struct net_device *real_dev = info->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc(sizeof(*netpoll), gfp); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev); + err = __netpoll_setup(netpoll, real_dev, gfp); if (err) { kfree(netpoll); goto out; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 33348453760..ed0e0f9dc78 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -213,7 +213,8 @@ static void br_netpoll_cleanup(struct net_device *dev) } } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, + gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; @@ -222,8 +223,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) list_for_each_entry_safe(p, n, &br->port_list, list) { if (!p->dev) continue; - - err = br_netpoll_enable(p); + err = br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -236,17 +236,17 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p) +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc(sizeof(*p->np), gfp); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, p->dev); + err = __netpoll_setup(np, p->dev, gfp); if (err) { kfree(np); goto out; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e1144e1617b..171fd6b9bfe 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_set_master(dev, br->dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a768b2408ed..f507d2af964 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -extern int br_netpoll_enable(struct net_bridge_port *p); +extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); extern void br_netpoll_disable(struct net_bridge_port *p); #else static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) @@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p) +static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b4c90e42b44..37cc854774a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,7 +715,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) { struct netpoll_info *npinfo; const struct net_device_ops *ops; @@ -734,7 +734,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc(sizeof(*npinfo), gfp); if (!npinfo) { err = -ENOMEM; goto out; @@ -752,7 +752,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); if (err) goto free_npinfo; } @@ -857,7 +857,7 @@ int netpoll_setup(struct netpoll *np) refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np, ndev); + err = __netpoll_setup(np, ndev, GFP_KERNEL); rtnl_unlock(); if (err) -- cgit v1.2.3-70-g09d2 From 38e6bc185d9544dfad1774b3f8902a0b061aea25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:38 +0000 Subject: netpoll: make __netpoll_cleanup non-block Like the previous patch, slave_disable_netpoll() and __netpoll_cleanup() may be called with read_lock() held too, so we should make them non-block, by moving the cleanup and kfree() to call_rcu_bh() callbacks. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +--- include/linux/netpoll.h | 3 +++ net/8021q/vlan_dev.c | 6 +----- net/bridge/br_device.c | 6 +----- net/core/netpoll.c | 42 +++++++++++++++++++++++++++++++---------- 5 files changed, 38 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8697136e27c..e42891683e3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1257,9 +1257,7 @@ static inline void slave_disable_netpoll(struct slave *slave) return; slave->np = NULL; - synchronize_rcu_bh(); - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } static inline bool slave_dev_support_netpoll(struct net_device *slave_dev) { diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index bf2d51eec0f..907812efb4d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -23,6 +23,7 @@ struct netpoll { u8 remote_mac[ETH_ALEN]; struct list_head rx; /* rx_np list element */ + struct rcu_head rcu; }; struct netpoll_info { @@ -38,6 +39,7 @@ struct netpoll_info { struct delayed_work tx_work; struct netpoll *netpoll; + struct rcu_head rcu; }; void netpoll_send_udp(struct netpoll *np, const char *msg, int len); @@ -48,6 +50,7 @@ int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); +void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); int __netpoll_rx(struct sk_buff *skb); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ee4ae0944ce..b65623f9066 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -704,11 +704,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) info->netpoll = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(netpoll); - kfree(netpoll); + __netpoll_free_rcu(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed0e0f9dc78..f41ba4048c9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 37cc854774a..dc17f1db147 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -878,6 +878,24 @@ static int __init netpoll_init(void) } core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -903,20 +921,24 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu_bh(); +static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +{ + struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_delayed_work_sync(&npinfo->tx_work); + __netpoll_cleanup(np); + kfree(np); +} - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - } +void __netpoll_free_rcu(struct netpoll *np) +{ + call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); +EXPORT_SYMBOL_GPL(__netpoll_free_rcu); void netpoll_cleanup(struct netpoll *np) { -- cgit v1.2.3-70-g09d2 From 57c5d46191e75312934c00eba65b13a31ca95120 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:40 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_rx() In __netpoll_rx(), it dereferences ->npinfo without rcu_dereference_bh(), this patch fixes it by using the 'npinfo' passed from netpoll_rx() where it is already dereferenced with rcu_dereference_bh(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 4 ++-- net/core/netpoll.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 907812efb4d..5d881c38827 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -52,7 +52,7 @@ void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -int __netpoll_rx(struct sk_buff *skb); +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -77,7 +77,7 @@ static inline bool netpoll_rx(struct sk_buff *skb) spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ - if (npinfo->rx_flags && __netpoll_rx(skb)) + if (npinfo->rx_flags && __netpoll_rx(skb, npinfo)) ret = true; spin_unlock(&npinfo->rx_lock); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dc17f1db147..d055bb01328 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -543,13 +543,12 @@ static void arp_reply(struct sk_buff *skb) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -int __netpoll_rx(struct sk_buff *skb) +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { int proto, len, ulen; int hits = 0; const struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; if (list_empty(&npinfo->rx_np)) -- cgit v1.2.3-70-g09d2 From 91fe4a4b9e490a24f6702dd8afe72d8afab6fcdb Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:41 +0000 Subject: netpoll: use netpoll_rx_on() in netpoll_rx() The logic of the code is same, just call netpoll_rx_on(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 5d881c38827..2d178baa49d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,6 +63,13 @@ static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) #ifdef CONFIG_NETPOLL +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); + + return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); +} + static inline bool netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo; @@ -70,11 +77,11 @@ static inline bool netpoll_rx(struct sk_buff *skb) bool ret = false; local_irq_save(flags); - npinfo = rcu_dereference_bh(skb->dev->npinfo); - if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags)) + if (!netpoll_rx_on(skb)) goto out; + npinfo = rcu_dereference_bh(skb->dev->npinfo); spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ if (npinfo->rx_flags && __netpoll_rx(skb, npinfo)) @@ -86,13 +93,6 @@ out: return ret; } -static inline int netpoll_rx_on(struct sk_buff *skb) -{ - struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); - - return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); -} - static inline int netpoll_receive_skb(struct sk_buff *skb) { if (!list_empty(&skb->dev->napi_list)) -- cgit v1.2.3-70-g09d2 From 2899656b494dcd118123af1126826b115c8ea6f9 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:42 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_send_skb_on_dev() This patch fixes several problems in the call path of netpoll_send_skb_on_dev(): 1. Disable IRQ's before calling netpoll_send_skb_on_dev(). 2. All the callees of netpoll_send_skb_on_dev() should use rcu_dereference_bh() to dereference ->npinfo. 3. Rename arp_reply() to netpoll_arp_reply(), the former is too generic. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 3 +++ net/core/netpoll.c | 31 +++++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2d178baa49d..61aee86cf21 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,10 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { + unsigned long flags; + local_irq_save(flags); netpoll_send_skb_on_dev(np, skb, np->dev); + local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d055bb01328..174346ac15a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -54,7 +54,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -170,7 +170,8 @@ static void poll_napi(struct net_device *dev) list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(dev->npinfo, napi, budget); + budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), + napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -185,13 +186,14 @@ static void service_arp_queue(struct netpoll_info *npi) struct sk_buff *skb; while ((skb = skb_dequeue(&npi->arp_tx))) - arp_reply(skb); + netpoll_arp_reply(skb, npi); } } static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); if (!dev || !netif_running(dev)) return; @@ -206,17 +208,18 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); if (dev->flags & IFF_SLAVE) { - if (dev->npinfo) { + if (ni) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; - while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->arp_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + skb_queue_tail(&bond_ni->arp_tx, skb); } } } - service_arp_queue(dev->npinfo); + service_arp_queue(ni); zap_completion_queue(); } @@ -302,6 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } +/* call with IRQ disabled */ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { @@ -309,8 +313,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, unsigned long tries; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ - struct netpoll_info *npinfo = np->dev->npinfo; + struct netpoll_info *npinfo; + + WARN_ON_ONCE(!irqs_disabled()); + npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { __kfree_skb(skb); return; @@ -319,11 +326,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - unsigned long flags; txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -347,10 +352,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); } if (status != NETDEV_TX_OK) { @@ -423,9 +427,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; -- cgit v1.2.3-70-g09d2 From e15c3c2294605f09f9b336b2f3b97086ab4b8145 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:45 +0000 Subject: netpoll: check netpoll tx status on the right device Although this doesn't matter actually, because netpoll_tx_running() doesn't use the parameter, the code will be more readable. For team_dev_queue_xmit() we have to move it down to avoid compile errors. Cc: David Miller Signed-off-by: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- include/linux/if_team.h | 30 +++++++++++++++--------------- net/bridge/br_forward.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e42891683e3..d688a8af432 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -398,7 +398,7 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; - if (unlikely(netpoll_tx_running(slave_dev))) + if (unlikely(netpoll_tx_running(bond->dev))) bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); else dev_queue_xmit(skb); diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 6960fc1841a..aa2e167e1ef 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -96,21 +96,6 @@ static inline void team_netpoll_send_skb(struct team_port *port, } #endif -static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, - struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(skb->queue_mapping) != - sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); - skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); - - skb->dev = port->dev; - if (unlikely(netpoll_tx_running(port->dev))) { - team_netpoll_send_skb(port, skb); - return 0; - } - return dev_queue_xmit(skb); -} - struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); @@ -200,6 +185,21 @@ struct team { long mode_priv[TEAM_MODE_PRIV_LONGS]; }; +static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, + struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->queue_mapping) != + sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); + skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); + + skb->dev = port->dev; + if (unlikely(netpoll_tx_running(team->dev))) { + team_netpoll_send_skb(port, skb); + return 0; + } + return dev_queue_xmit(skb); +} + static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e9466d41270..02015a505d2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; - if (unlikely(netpoll_tx_running(to->dev))) { + if (unlikely(netpoll_tx_running(to->br->dev))) { if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { -- cgit v1.2.3-70-g09d2 From 77ab8a54d9a8dcc4a46484a04133314f33f2aba6 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:46 +0000 Subject: netpoll: convert several functions to bool These functions are just boolean, let them return bool instead of int. Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 61aee86cf21..66d5379c305 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -66,7 +66,7 @@ static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) #ifdef CONFIG_NETPOLL -static inline int netpoll_rx_on(struct sk_buff *skb) +static inline bool netpoll_rx_on(struct sk_buff *skb) { struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); @@ -125,7 +125,7 @@ static inline void netpoll_poll_unlock(void *have) } } -static inline int netpoll_tx_running(struct net_device *dev) +static inline bool netpoll_tx_running(struct net_device *dev) { return irqs_disabled(); } @@ -133,11 +133,11 @@ static inline int netpoll_tx_running(struct net_device *dev) #else static inline bool netpoll_rx(struct sk_buff *skb) { - return 0; + return false; } -static inline int netpoll_rx_on(struct sk_buff *skb) +static inline bool netpoll_rx_on(struct sk_buff *skb) { - return 0; + return false; } static inline int netpoll_receive_skb(struct sk_buff *skb) { @@ -153,9 +153,9 @@ static inline void netpoll_poll_unlock(void *have) static inline void netpoll_netdev_init(struct net_device *dev) { } -static inline int netpoll_tx_running(struct net_device *dev) +static inline bool netpoll_tx_running(struct net_device *dev) { - return 0; + return false; } #endif -- cgit v1.2.3-70-g09d2 From 6024935f5ff5f1646bce8404416318e5fd4a0c4a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:49:59 +0000 Subject: llc2: Fix silent failure of llc_station_init() llc_station_init() creates and processes an event skb with no effect other than to change the state from DOWN to UP. Allocation failure is reported, but then ignored by its caller, llc2_init(). Remove this possibility by simply initialising the state as UP. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/llc.h | 2 +- net/llc/llc_station.c | 19 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/llc.h b/include/net/llc.h index 226c846cab0..f2d0fc57052 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -133,7 +133,7 @@ extern int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, extern void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); extern void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); -extern int llc_station_init(void); +extern void llc_station_init(void); extern void llc_station_exit(void); #ifdef CONFIG_PROC_FS diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 6828e39ec2e..45ddbb93c5d 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb) llc_station_state_process(skb); } -int __init llc_station_init(void) +void __init llc_station_init(void) { - int rc = -ENOBUFS; - struct sk_buff *skb; - struct llc_station_state_ev *ev; - skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); @@ -700,20 +696,9 @@ int __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) - goto out; - rc = 0; llc_set_station_handler(llc_station_rcv); - ev = llc_station_ev(skb); - memset(ev, 0, sizeof(*ev)); llc_main_station.maximum_retry = 1; - llc_main_station.state = LLC_STATION_STATE_DOWN; - ev->type = LLC_STATION_EV_TYPE_SIMPLE; - ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; - rc = llc_station_next_state(skb); -out: - return rc; + llc_main_station.state = LLC_STATION_STATE_UP; } void __exit llc_station_exit(void) -- cgit v1.2.3-70-g09d2 From 8857df3aceb7a8eb7558059b7da109e41dd1fb95 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 20 Jul 2012 09:31:00 +0100 Subject: iio: frequency: ADF4350: Fix potential reference div factor overflow. With small channel spacing values and high reference frequencies it is possible to exceed the range of the 10-bit counter. Workaround by checking the range and widening some constrains. We don't use the REG1_PHASE value in this case the datasheet recommends to set it to 1 if not used. Signed-off-by: Michael Hennerich Signed-off-by: Jonathan Cameron --- drivers/iio/frequency/adf4350.c | 24 +++++++++++++++--------- include/linux/iio/frequency/adf4350.h | 2 ++ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/iio/frequency/adf4350.c b/drivers/iio/frequency/adf4350.c index 59fbb3ae40e..e35bb8f6fe7 100644 --- a/drivers/iio/frequency/adf4350.c +++ b/drivers/iio/frequency/adf4350.c @@ -129,7 +129,7 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) { struct adf4350_platform_data *pdata = st->pdata; u64 tmp; - u32 div_gcd, prescaler; + u32 div_gcd, prescaler, chspc; u16 mdiv, r_cnt = 0; u8 band_sel_div; @@ -158,14 +158,20 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) if (pdata->ref_div_factor) r_cnt = pdata->ref_div_factor - 1; - do { - r_cnt = adf4350_tune_r_cnt(st, r_cnt); + chspc = st->chspc; - st->r1_mod = st->fpfd / st->chspc; - while (st->r1_mod > ADF4350_MAX_MODULUS) { - r_cnt = adf4350_tune_r_cnt(st, r_cnt); - st->r1_mod = st->fpfd / st->chspc; - } + do { + do { + do { + r_cnt = adf4350_tune_r_cnt(st, r_cnt); + st->r1_mod = st->fpfd / chspc; + if (r_cnt > ADF4350_MAX_R_CNT) { + /* try higher spacing values */ + chspc++; + r_cnt = 0; + } + } while ((st->r1_mod > ADF4350_MAX_MODULUS) && r_cnt); + } while (r_cnt == 0); tmp = freq * (u64)st->r1_mod + (st->fpfd > 1); do_div(tmp, st->fpfd); /* Div round closest (n + d/2)/d */ @@ -194,7 +200,7 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) st->regs[ADF4350_REG0] = ADF4350_REG0_INT(st->r0_int) | ADF4350_REG0_FRACT(st->r0_fract); - st->regs[ADF4350_REG1] = ADF4350_REG1_PHASE(0) | + st->regs[ADF4350_REG1] = ADF4350_REG1_PHASE(1) | ADF4350_REG1_MOD(st->r1_mod) | prescaler; diff --git a/include/linux/iio/frequency/adf4350.h b/include/linux/iio/frequency/adf4350.h index b76b4a87065..be91f344d5f 100644 --- a/include/linux/iio/frequency/adf4350.h +++ b/include/linux/iio/frequency/adf4350.h @@ -87,6 +87,8 @@ #define ADF4350_MAX_BANDSEL_CLK 125000 /* Hz */ #define ADF4350_MAX_FREQ_REFIN 250000000 /* Hz */ #define ADF4350_MAX_MODULUS 4095 +#define ADF4350_MAX_R_CNT 1023 + /** * struct adf4350_platform_data - platform specific information -- cgit v1.2.3-70-g09d2 From ac5aa7f9e0891a115ab307b4bdde9c55b9232970 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Fri, 10 Aug 2012 16:52:58 +0200 Subject: pinctrl: header: trivial: declare struct device As struct device is used as a function argument, it should at least be declared (device.h is not included). Signed-off-by: Richard Genoud Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 6dd96fb4548..e9b7f435084 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -20,6 +20,7 @@ /* This struct is private to the core and should be regarded as a cookie */ struct pinctrl; struct pinctrl_state; +struct device; #ifdef CONFIG_PINCTRL -- cgit v1.2.3-70-g09d2 From 8513915accc611e576dbebb93422c257e7e68be8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 17:43:05 -0700 Subject: ALSA: fix pcm.h kernel-doc warning and notation Fix kernel-doc warning in and add function name to make the kernel-doc notation complete. Warning(include/sound/pcm.h:1081): No description found for parameter 'substream' Signed-off-by: Randy Dunlap Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index c75c0d1a85e..cdca2ab1e71 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1075,7 +1075,8 @@ static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max) const char *snd_pcm_format_name(snd_pcm_format_t format); /** - * Get a string naming the direction of a stream + * snd_pcm_stream_str - Get a string naming the direction of a stream + * @substream: the pcm substream instance */ static inline const char *snd_pcm_stream_str(struct snd_pcm_substream *substream) { -- cgit v1.2.3-70-g09d2 From 3296193d1421c2d6f9e49e181cecfd917f0f5764 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Tue, 14 Aug 2012 13:20:23 +0000 Subject: dt: introduce for_each_available_child_of_node, of_get_next_available_child Macro for_each_child_of_node() makes it easy to iterate over all of the children for a given device tree node, including those nodes that are marked as unavailable (i.e. status = "disabled"). Introduce for_each_available_child_of_node(), which is like for_each_child_of_node(), but it automatically skips unavailable nodes. This also requires the introduction of helper function of_get_next_available_child(), which returns the next available child node. Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/of/base.c | 27 +++++++++++++++++++++++++++ include/linux/of.h | 7 +++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index c181b94abc3..d4a1c9a043e 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -363,6 +363,33 @@ struct device_node *of_get_next_child(const struct device_node *node, } EXPORT_SYMBOL(of_get_next_child); +/** + * of_get_next_available_child - Find the next available child node + * @node: parent node + * @prev: previous child of the parent node, or NULL to get first + * + * This function is like of_get_next_child(), except that it + * automatically skips any disabled nodes (i.e. status = "disabled"). + */ +struct device_node *of_get_next_available_child(const struct device_node *node, + struct device_node *prev) +{ + struct device_node *next; + + read_lock(&devtree_lock); + next = prev ? prev->sibling : node->child; + for (; next; next = next->sibling) { + if (!of_device_is_available(next)) + continue; + if (of_node_get(next)) + break; + } + of_node_put(prev); + read_unlock(&devtree_lock); + return next; +} +EXPORT_SYMBOL(of_get_next_available_child); + /** * of_find_node_by_path - Find a node matching a full OF path * @path: The full path to match diff --git a/include/linux/of.h b/include/linux/of.h index 5919ee33f2b..1b1163225f3 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -190,10 +190,17 @@ extern struct device_node *of_get_parent(const struct device_node *node); extern struct device_node *of_get_next_parent(struct device_node *node); extern struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev); +extern struct device_node *of_get_next_available_child( + const struct device_node *node, struct device_node *prev); + #define for_each_child_of_node(parent, child) \ for (child = of_get_next_child(parent, NULL); child != NULL; \ child = of_get_next_child(parent, child)) +#define for_each_available_child_of_node(parent, child) \ + for (child = of_get_next_available_child(parent, NULL); child != NULL; \ + child = of_get_next_available_child(parent, child)) + static inline int of_get_child_count(const struct device_node *np) { struct device_node *child; -- cgit v1.2.3-70-g09d2 From c0de08d04215031d68fa13af36f347a6cfa252ca Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 16 Aug 2012 22:02:58 +0000 Subject: af_packet: don't emit packet on orig fanout group If a packet is emitted on one socket in one group of fanout sockets, it is transmitted again. It is thus read again on one of the sockets of the fanout group. This result in a loop for software which generate packets when receiving one. This retransmission is not the intended behavior: a fanout group must behave like a single socket. The packet should not be transmitted on a socket if it originates from a socket belonging to the same fanout group. This patch fixes the issue by changing the transmission check to take fanout group info account. Reported-by: Aleksandr Kotov Signed-off-by: Eric Leblond Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 16 ++++++++++++++-- net/packet/af_packet.c | 9 +++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3560d688161..59dc05f3824 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1522,6 +1522,8 @@ struct packet_type { struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb); + bool (*id_match)(struct packet_type *ptype, + struct sock *sk); void *af_packet_priv; struct list_head list; }; diff --git a/net/core/dev.c b/net/core/dev.c index a39354ee143..debd9372472 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1642,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (ptype->af_packet_priv == NULL) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1659,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ac890a1a4c..aee7196aac3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } -- cgit v1.2.3-70-g09d2 From 9d7b0fc1ef1f17aff57c0dc9a59453d8fca255c3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 20 Aug 2012 02:56:56 -0700 Subject: net: ipv6: fix oops in inet_putpeer() Commit 97bab73f (inet: Hide route peer accesses behind helpers.) introduced a bug in xfrm6_policy_destroy(). The xfrm_dst's _rt6i_peer member is not initialized, causing a false positive result from inetpeer_ptr_is_peer(), which in turn causes a NULL pointer dereference in inet_putpeer(). Pid: 314, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #17 To Be Filled By O.E.M. To Be Filled By O.E.M./P4S800D-X EIP: 0060:[] EFLAGS: 00010246 CPU: 0 EIP is at inet_putpeer+0xe/0x16 EAX: 00000000 EBX: f3481700 ECX: 00000000 EDX: 000dd641 ESI: f3481700 EDI: c05e949c EBP: f551def4 ESP: f551def4 DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 00000070 CR3: 3243d000 CR4: 00000750 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 f551df04 c0423de1 00000000 f3481700 f551df18 c038d5f7 f254b9f8 f551df28 f34f85d8 f551df20 c03ef48d f551df3c c0396870 f30697e8 f24e1738 c05e98f4 f5509540 c05cd2b4 f551df7c c0142d2b c043feb5 f5509540 00000000 c05cd2e8 [] xfrm6_dst_destroy+0x42/0xdb [] dst_destroy+0x1d/0xa4 [] xfrm_bundle_flo_delete+0x2b/0x36 [] flow_cache_gc_task+0x85/0x9f [] process_one_work+0x122/0x441 [] ? apic_timer_interrupt+0x31/0x38 [] ? flow_cache_new_hashrnd+0x2b/0x2b [] worker_thread+0x113/0x3cc Fix by adding a init_dst() callback to struct xfrm_policy_afinfo to properly initialize the dst's peer pointer. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/xfrm.h | 2 ++ net/ipv6/xfrm6_policy.c | 8 ++++++++ net/xfrm/xfrm_policy.c | 2 ++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 62b619e82a9..976a81abe1a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -292,6 +292,8 @@ struct xfrm_policy_afinfo { struct flowi *fl, int reverse); int (*get_tos)(const struct flowi *fl); + void (*init_dst)(struct net *net, + struct xfrm_dst *dst); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef39812107b..f8c4c08ffb6 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) +{ + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); +} + static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a592..5a2aa17e4d3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1357,6 +1357,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; + if (afinfo->init_dst) + afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); -- cgit v1.2.3-70-g09d2 From af74115eed22698f771fec1287a864975c9a6671 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 15 Aug 2012 21:24:52 -0700 Subject: target: Remove unused se_cmd.cmd_spdtl This was originally for helping fabrics to determine overflow/underflow status, and has been superceeded by SCF_OVERFLOW_BIT + SCF_UNDERFLOW_BIT. Signed-off-by: Roland Dreier Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 2 -- include/target/target_core_base.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ea9a3d2e4f5..4de3186dc44 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1165,8 +1165,6 @@ int target_cmd_size_check(struct se_cmd *cmd, unsigned int size) " 0x%02x\n", cmd->se_tfo->get_fabric_name(), cmd->data_length, size, cmd->t_task_cdb[0]); - cmd->cmd_spdtl = size; - if (cmd->data_direction == DMA_TO_DEVICE) { pr_err("Rejecting underflow/overflow" " WRITE data\n"); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 128ce46fa48..015cea01ae3 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -503,8 +503,6 @@ struct se_cmd { u32 se_ordered_id; /* Total size in bytes associated with command */ u32 data_length; - /* SCSI Presented Data Transfer Length */ - u32 cmd_spdtl; u32 residual_count; u32 orig_fe_lun; /* Persistent Reservation key */ -- cgit v1.2.3-70-g09d2 From e0e3cea46d31d23dc40df0a49a7a2c04fe8edfea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 06:21:17 +0000 Subject: af_netlink: force credentials passing [CVE-2012-3520] Pablo Neira Ayuso discovered that avahi and potentially NetworkManager accept spoofed Netlink messages because of a kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data to the receiver if the sender did not provide such data, instead of not including any such data at all or including the correct data from the peer (as it is the case with AF_UNIX). This bug was introduced in commit 16e572626961 (af_unix: dont send SCM_CREDENTIALS by default) This patch forces passing credentials for netlink, as before the regression. Another fix would be to not add SCM_CREDENTIALS in netlink messages if not provided by the sender, but it might break some programs. With help from Florian Weimer & Petr Matousek This issue is designated as CVE-2012-3520 Signed-off-by: Eric Dumazet Cc: Petr Matousek Cc: Florian Weimer Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/scm.h | 4 +++- net/netlink/af_netlink.c | 2 +- net/unix/af_unix.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index 079d7887dac..7dc0854f0b3 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -70,9 +70,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm) + struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); + if (forcecreds) + scm_set_cred(scm, task_tgid(current), current_cred()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45..1445d73533e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4768c180da..c5ee4ff6136 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; -- cgit v1.2.3-70-g09d2 From 04ccfe77f132b973659f11954443214659014072 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Fri, 17 Aug 2012 14:20:02 +0000 Subject: drm: Remove two unused fields from struct drm_display_mode Signed-off-by: Damien Lespiau Reviewed-by: Jani Nikula Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_modes.c | 3 --- include/drm/drm_crtc.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index b7adb4a967f..28637c181b1 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -706,9 +706,6 @@ void drm_mode_set_crtcinfo(struct drm_display_mode *p, int adjust_flags) p->crtc_vblank_end = max(p->crtc_vsync_end, p->crtc_vtotal); p->crtc_hblank_start = min(p->crtc_hsync_start, p->crtc_hdisplay); p->crtc_hblank_end = max(p->crtc_hsync_end, p->crtc_htotal); - - p->crtc_hadjusted = false; - p->crtc_vadjusted = false; } EXPORT_SYMBOL(drm_mode_set_crtcinfo); diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index a1a0386e016..ced362533e3 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -166,8 +166,6 @@ struct drm_display_mode { int crtc_vsync_start; int crtc_vsync_end; int crtc_vtotal; - int crtc_hadjusted; - int crtc_vadjusted; /* Driver private mode info */ int private_size; -- cgit v1.2.3-70-g09d2 From c3a5ce0416b6c172a23bc8a3760d8704d3d1535b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 21 Aug 2012 16:16:00 -0700 Subject: string: do not export memweight() to userspace Fix the following warning: usr/include/linux/string.h:8: userspace cannot reference function or variable defined in the kernel Signed-off-by: WANG Cong Acked-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index ffe0442e18d..b9178812d9d 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -144,8 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } -#endif extern size_t memweight(const void *ptr, size_t bytes); +#endif /* __KERNEL__ */ #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3-70-g09d2 From c67fe3752abe6ab47639e2f9b836900c3dc3da84 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Aug 2012 16:16:17 -0700 Subject: mm: compaction: Abort async compaction if locks are contended or taking too long Jim Schutt reported a problem that pointed at compaction contending heavily on locks. The workload is straight-forward and in his own words; The systems in question have 24 SAS drives spread across 3 HBAs, running 24 Ceph OSD instances, one per drive. FWIW these servers are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux clients doing dd simultaneously to a Ceph file system backed by 12 of these servers. Early in the test everything looks fine procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0 27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0 28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0 6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0 22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0 and then it goes to pot procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0 207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0 123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0 123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0 622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0 223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0 Note that system CPU usage is very high blocks being written out has dropped by 42%. He analysed this with perf and found perf record -g -a sleep 10 perf report --sort symbol --call-graph fractal,5 34.63% [k] _raw_spin_lock_irqsave | |--97.30%-- isolate_freepages | compaction_alloc | unmap_and_move | migrate_pages | compact_zone | compact_zone_order | try_to_compact_pages | __alloc_pages_direct_compact | __alloc_pages_slowpath | __alloc_pages_nodemask | alloc_pages_vma | do_huge_pmd_anonymous_page | handle_mm_fault | do_page_fault | page_fault | | | |--87.39%-- skb_copy_datagram_iovec | | tcp_recvmsg | | inet_recvmsg | | sock_recvmsg | | sys_recvfrom | | system_call | | __recv | | | | | --100.00%-- (nil) | | | --12.61%-- memcpy --2.70%-- [...] There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration] noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule. Reported-by: Jim Schutt Tested-by: Jim Schutt Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- mm/compaction.c | 100 +++++++++++++++++++++++++++++++++++---------- mm/internal.h | 1 + mm/page_alloc.c | 17 +++++--- 4 files changed, 93 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 133ddcf8339..ef658147e4e 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index bcce7897e17..7fcd3a52e68 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,47 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. Check if the process needs to be scheduled or + * if the lock is contended. For async compaction, back out in the event + * if contention is severe. For sync compaction, schedule. + * + * Returns true if the lock is held. + * Returns false if the lock is released and compaction should abort + */ +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + bool locked, struct compact_control *cc) +{ + if (need_resched() || spin_is_contended(lock)) { + if (locked) { + spin_unlock_irqrestore(lock, *flags); + locked = false; + } + + /* async aborts if taking too long or contended */ + if (!cc->sync) { + if (cc->contended) + *cc->contended = true; + return false; + } + + cond_resched(); + if (fatal_signal_pending(current)) + return false; + } + + if (!locked) + spin_lock_irqsave(lock, *flags); + return true; +} + +static inline bool compact_trylock_irqsave(spinlock_t *lock, + unsigned long *flags, struct compact_control *cc) +{ + return compact_checklock_irqsave(lock, flags, false, cc); +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; @@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + /* If locked we can use the interrupt unsafe versions */ + if (locked) { + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } else { + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } } /* Similar to reclaim, but different enough that they don't share logic */ @@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct list_head *migratelist = &cc->migratepages; isolate_mode_t mode = 0; struct lruvec *lruvec; + unsigned long flags; + bool locked; /* * Ensure that there are not too many pages isolated from the LRU @@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irq(&zone->lru_lock); + spin_lock_irqsave(&zone->lru_lock, flags); + locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; - bool locked = true; /* give a chance to irqs before checking need_resched() */ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; } - if (need_resched() || spin_is_contended(&zone->lru_lock)) { - if (locked) - spin_unlock_irq(&zone->lru_lock); - cond_resched(); - spin_lock_irq(&zone->lru_lock); - if (fatal_signal_pending(current)) - break; - } else if (!locked) - spin_lock_irq(&zone->lru_lock); + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked) + break; /* * migrate_pfn does not necessarily start aligned to a @@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, } } - acct_isolated(zone, cc); + acct_isolated(zone, locked, cc); - spin_unlock_irq(&zone->lru_lock); + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone, * are disabled */ isolated = 0; - spin_lock_irqsave(&zone->lock, flags); + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock + */ + if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) + break; if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, @@ -773,7 +829,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + bool sync, bool *contended) { struct compact_control cc = { .nr_freepages = 0, @@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/internal.h b/mm/internal.h index 3314f79d775..b8c91b342e2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,6 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + bool *contended; /* True if a lock was contended */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07f19248acb..c66fb875104 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { struct page *page; @@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration); + nodemask, sync_migration, + contended_compaction); current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; bool sync_migration = false; bool deferred_compaction = false; + bool contended_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2425,6 +2427,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2434,10 +2437,11 @@ rebalance: /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller - * has requested the system not be heavily disrupted, fail the - * allocation now instead of entering direct reclaim + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. */ - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + if ((deferred_compaction || contended_compaction) && + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ @@ -2508,6 +2512,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) -- cgit v1.2.3-70-g09d2 From 8ad5db8a8ddbe3bd33078863a027011e28f1f4ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Aug 2012 20:10:46 -0400 Subject: introduce kref_put_mutex() equivalent of mutex_lock(mutex); if (!kref_put(kref, release)) mutex_unlock(mutex); Signed-off-by: Al Viro --- include/linux/kref.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/kref.h b/include/linux/kref.h index 9c07dcebded..65af6887872 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -18,6 +18,7 @@ #include #include #include +#include struct kref { atomic_t refcount; @@ -93,4 +94,21 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) { return kref_sub(kref, 1, release); } + +static inline int kref_put_mutex(struct kref *kref, + void (*release)(struct kref *kref), + struct mutex *lock) +{ + WARN_ON(release == NULL); + if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { + mutex_lock(lock); + if (unlikely(!atomic_dec_and_test(&kref->refcount))) { + mutex_unlock(lock); + return 0; + } + release(kref); + return 1; + } + return 0; +} #endif /* _KREF_H_ */ -- cgit v1.2.3-70-g09d2